evalscope 0.16.3__py3-none-any.whl → 0.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/app/app.py +9 -762
- evalscope/app/constants.py +1 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +52 -0
- evalscope/app/ui/multi_model.py +323 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +202 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +178 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +91 -0
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/backend_manager.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
- evalscope/benchmarks/__init__.py +15 -1
- evalscope/benchmarks/aime/aime24_adapter.py +2 -1
- evalscope/benchmarks/aime/aime25_adapter.py +2 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/utils.py +0 -12
- evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
- evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
- evalscope/benchmarks/data_adapter.py +29 -9
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
- evalscope/benchmarks/general_arena/utils.py +226 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +44 -30
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +118 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +2 -2
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/race/race_adapter.py +1 -1
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
- evalscope/benchmarks/utils.py +2 -2
- evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
- evalscope/config.py +8 -123
- evalscope/constants.py +5 -21
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +20 -15
- evalscope/metrics/__init__.py +9 -1
- evalscope/{utils/utils.py → metrics/completion_parsers.py} +71 -176
- evalscope/metrics/llm_judge.py +106 -20
- evalscope/metrics/metrics.py +20 -8
- evalscope/models/__init__.py +4 -8
- evalscope/models/adapters/__init__.py +4 -9
- evalscope/models/adapters/base_adapter.py +4 -0
- evalscope/models/adapters/bfcl_adapter.py +2 -0
- evalscope/models/adapters/chat_adapter.py +3 -0
- evalscope/models/adapters/choice_adapter.py +4 -0
- evalscope/models/adapters/custom_adapter.py +7 -3
- evalscope/models/adapters/server_adapter.py +4 -2
- evalscope/models/adapters/t2i_adapter.py +3 -0
- evalscope/models/adapters/tau_bench_adapter.py +189 -0
- evalscope/models/custom/dummy_model.py +3 -3
- evalscope/models/register.py +0 -14
- evalscope/perf/arguments.py +15 -16
- evalscope/perf/benchmark.py +38 -39
- evalscope/perf/http_client.py +30 -86
- evalscope/perf/main.py +3 -3
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +22 -4
- evalscope/perf/plugin/api/custom_api.py +212 -55
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +105 -0
- evalscope/perf/plugin/api/openai_api.py +17 -19
- evalscope/perf/plugin/datasets/__init__.py +10 -7
- evalscope/perf/plugin/datasets/base.py +22 -1
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +4 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +2 -1
- evalscope/perf/plugin/datasets/random_dataset.py +15 -4
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +14 -20
- evalscope/perf/utils/db_util.py +79 -61
- evalscope/report/__init__.py +1 -1
- evalscope/report/utils.py +34 -15
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -2
- evalscope/utils/__init__.py +63 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/import_utils.py +16 -0
- evalscope/utils/io_utils.py +55 -4
- evalscope/utils/model_utils.py +37 -1
- evalscope/version.py +2 -2
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/METADATA +100 -51
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/RECORD +129 -133
- tests/aigc/test_t2i.py +1 -1
- tests/cli/test_all.py +68 -4
- tests/cli/test_collection.py +1 -1
- tests/cli/test_custom.py +261 -0
- tests/cli/test_run.py +34 -70
- tests/perf/test_perf.py +31 -4
- tests/rag/test_clip_benchmark.py +2 -1
- tests/rag/test_mteb.py +3 -1
- tests/rag/test_ragas.py +3 -1
- tests/swift/test_run_swift_eval.py +2 -1
- tests/swift/test_run_swift_vlm_eval.py +2 -1
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
- tests/utils.py +13 -0
- tests/vlm/test_vlmeval.py +8 -2
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/models/model.py +0 -189
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- /evalscope/{utils → benchmarks}/filters.py +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
|
@@ -1,25 +1,34 @@
|
|
|
1
1
|
evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
|
|
2
2
|
evalscope/arguments.py,sha256=QkxE8eGSryiyo9uDiNQNZUI3l_hGPYmhVz1-KHgtB6E,6044
|
|
3
|
-
evalscope/config.py,sha256=
|
|
4
|
-
evalscope/constants.py,sha256=
|
|
5
|
-
evalscope/run.py,sha256=
|
|
6
|
-
evalscope/
|
|
7
|
-
evalscope/
|
|
8
|
-
evalscope/version.py,sha256=zdIprYl0JT22Bpk3tCmkfp1QLsi0mvROpxeTcgkCNHI,119
|
|
3
|
+
evalscope/config.py,sha256=1YfHXlIyYH70FQfi8TiUtpUH3VIRCh5YcbaayKZo5s4,6781
|
|
4
|
+
evalscope/constants.py,sha256=Tc74W89SxeeEzISDzO5IoxSo9A_F0LqjH0mOrcAYJXc,3737
|
|
5
|
+
evalscope/run.py,sha256=dL1deJ0J1RHW6X6ZStXzAVL7NwbjW6McfdOMkCpWrtc,7012
|
|
6
|
+
evalscope/summarizer.py,sha256=ZLFDHmi0Bgo18ouQsxuUl9vmIES9zkoapLLWRLhy19Q,5911
|
|
7
|
+
evalscope/version.py,sha256=wsTu-_Fq9Dmfg7bXg6eDVtNwZA5ui-MZ6IPs4EhytAc,119
|
|
9
8
|
evalscope/app/__init__.py,sha256=HWLXld_JXcBDsdL4L_4E8JsKyuBwwPUSwlejKnZ3HKc,579
|
|
10
|
-
evalscope/app/app.py,sha256=
|
|
9
|
+
evalscope/app/app.py,sha256=8mSBp8qUCCmqupV4FEPMPdT9jL-bYu4DdH2qj8P0ktk,776
|
|
11
10
|
evalscope/app/arguments.py,sha256=1wHTLeFx1G94cKXYOeOVe_wTiOY2D929UctIRGOtRaQ,699
|
|
12
|
-
evalscope/app/constants.py,sha256=
|
|
11
|
+
evalscope/app/constants.py,sha256=oG6tZ618zJcCnwZ5THnYL0gWTPDb5XKrnmdrWxY3Z4Q,385
|
|
12
|
+
evalscope/app/ui/__init__.py,sha256=IBxyQ2H-kSHoHJmXWDR8QMermvsMbiu673PQbXP_FnE,616
|
|
13
|
+
evalscope/app/ui/app_ui.py,sha256=FvpHsr4Lc0LAcwXIaVn9sUAAjO8QLNYCuojmKrjKvaE,2023
|
|
14
|
+
evalscope/app/ui/multi_model.py,sha256=7pe71PSaU7gnvogmCBMLUr_DUAgAeni12k4QcoHkFDs,15040
|
|
15
|
+
evalscope/app/ui/sidebar.py,sha256=JA0QbG2iPStK-lFy6x_AjOHlQdesmgXoS0OYJUJ_Wyg,1339
|
|
16
|
+
evalscope/app/ui/single_model.py,sha256=K5SU_S7WXWsbYLih2rQfRrVE50enzbCrq4rbhpo2uXo,9406
|
|
17
|
+
evalscope/app/ui/visualization.py,sha256=jXFX_-7woQkcAiQkPAIRwVv1kdRdXonn9IvmB8yzPDU,1102
|
|
18
|
+
evalscope/app/utils/data_utils.py,sha256=TMgiDu4MGvWgyd8G_nNOAOw39ZCRVFfRLLvrxCX_Ocw,6806
|
|
19
|
+
evalscope/app/utils/localization.py,sha256=rWEviBmcnhIpAA-cG8djbbUA6p1Y358c0dxge5Pqi1U,6131
|
|
20
|
+
evalscope/app/utils/text_utils.py,sha256=7DJow2W3Fna5Qny-AbwVRTWDh44ualONS5A5uUtesuk,3590
|
|
21
|
+
evalscope/app/utils/visualization.py,sha256=N9M7OV6lxcCvFtXmLBcUWw3RPlYZva7YH3rvhgTElqk,3522
|
|
13
22
|
evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
23
|
evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
|
|
15
24
|
evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
|
|
16
25
|
evalscope/backend/opencompass/api_meta_template.py,sha256=DaBJg15ZSIjxroXiygl3-4RdmIe_FD7xHbXvjSZmkQA,1706
|
|
17
|
-
evalscope/backend/opencompass/backend_manager.py,sha256=
|
|
26
|
+
evalscope/backend/opencompass/backend_manager.py,sha256=POEYRmNlptoRYlTNcpRcHEXwqrYo34RW4TM_kf7wMQQ,10458
|
|
18
27
|
evalscope/backend/opencompass/tasks/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
19
28
|
evalscope/backend/opencompass/tasks/eval_api.py,sha256=ZaGdUbEOtAW5VX3ZXmpHIttg_QrID34EnBTylD3uvos,1152
|
|
20
29
|
evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=JHSq4EnPJgv4sRJJplLH80EqE3ghtkn2k8HnV6DaDew,5406
|
|
21
30
|
evalscope/backend/rag_eval/__init__.py,sha256=Tbj7HboP5zzJ77-9qVEwwhHKjHL5V8MwLFr6sw1oeoA,291
|
|
22
|
-
evalscope/backend/rag_eval/backend_manager.py,sha256=
|
|
31
|
+
evalscope/backend/rag_eval/backend_manager.py,sha256=iEer5IhEJ8nOXW_s3j6l5jvfLgBftcGQMAtJk69Wzdc,3521
|
|
23
32
|
evalscope/backend/rag_eval/clip_benchmark/__init__.py,sha256=C8Vetf52nyHiRwY2Pm74Bjn3UpWboQeghCGNh67X1EM,151
|
|
24
33
|
evalscope/backend/rag_eval/clip_benchmark/arguments.py,sha256=d5UkbC3RXb6iyzy_ILumToAVO1AdwvDeyOiX5KB2u0g,1530
|
|
25
34
|
evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py,sha256=2OdPj4gSUWdAGCfS9PHpPGbd6q5RqEyli2G6UGb1ffw,8888
|
|
@@ -53,15 +62,16 @@ evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=YSqpaXMFVe8m
|
|
|
53
62
|
evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=6x-4O2pgsjZCVfJNvwZEKcgLe_QhSknPg-f2jGjZkU4,1890
|
|
54
63
|
evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
55
64
|
evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
|
|
56
|
-
evalscope/backend/rag_eval/utils/embedding.py,sha256=
|
|
65
|
+
evalscope/backend/rag_eval/utils/embedding.py,sha256=uqodHHvOKlza-bCLJ9Zkm8G1Jt2y2JT88jtIqCjA0sA,9379
|
|
57
66
|
evalscope/backend/rag_eval/utils/llm.py,sha256=NHjm0SeQVsSIG8uISXZcQypku4QRc3KtteeO9ldv0FI,2611
|
|
58
67
|
evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
|
|
59
68
|
evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
|
|
60
|
-
evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=
|
|
61
|
-
evalscope/benchmarks/__init__.py,sha256=
|
|
69
|
+
evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=jlwM13Ty-Ax6AeMsNlo9xIBupNFgnceYuXtCmh0hNTQ,6160
|
|
70
|
+
evalscope/benchmarks/__init__.py,sha256=NVd_VvmkY36LxdHNmgeogSBwMFfWoLJAZF8vDg-CoFc,1308
|
|
62
71
|
evalscope/benchmarks/benchmark.py,sha256=uZ_-Y_wPhy6TxufWiElF4BwEWN93azT1JHtGRW8tR-w,2633
|
|
63
|
-
evalscope/benchmarks/data_adapter.py,sha256=
|
|
64
|
-
evalscope/benchmarks/
|
|
72
|
+
evalscope/benchmarks/data_adapter.py,sha256=UI4HpnJNYo18GXRiU0HwNUxjRfoSXlCB-xEBIGs2ckg,23914
|
|
73
|
+
evalscope/benchmarks/filters.py,sha256=x_NX40uWMmUsVrAGHCeeV2e63HZZFugWUgdUhk64ivM,1523
|
|
74
|
+
evalscope/benchmarks/utils.py,sha256=mIk8n6zVMICQ5JWMyEwUqwlkxva4L-oD5SZzpIKw1sI,1851
|
|
65
75
|
evalscope/benchmarks/aigc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
66
76
|
evalscope/benchmarks/aigc/t2i/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
67
77
|
evalscope/benchmarks/aigc/t2i/base.py,sha256=4GFAvceT1Gpt5teDLRCZi62RwvPazuhG3zwft3gN3X4,2102
|
|
@@ -71,16 +81,16 @@ evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py,sha256=t9h5qlo4KrHOgXIhHo3z
|
|
|
71
81
|
evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py,sha256=U0RKN3apyD3YyZfIvqgO8TNuDO-zctlftHsSfBRyQxU,1825
|
|
72
82
|
evalscope/benchmarks/aigc/t2i/tifa_adapter.py,sha256=vOOiOe26H2dk9VN2WbB_Oi3lzavMIaYDBq6sqeSIiAU,1093
|
|
73
83
|
evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
74
|
-
evalscope/benchmarks/aime/aime24_adapter.py,sha256=
|
|
75
|
-
evalscope/benchmarks/aime/aime25_adapter.py,sha256=
|
|
84
|
+
evalscope/benchmarks/aime/aime24_adapter.py,sha256=iwOvjB-hwUYFRNDTe8xuRCFxASh69gCzuU3Vz9qnsUs,2070
|
|
85
|
+
evalscope/benchmarks/aime/aime25_adapter.py,sha256=fNJXUSCxjGyvtX_gkp4bveC_oXHwr1VNQdUePAuwjIE,2071
|
|
76
86
|
evalscope/benchmarks/alpaca_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
77
|
-
evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=
|
|
87
|
+
evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=BLU3G7IB3gmIYiXtznzHjPIrvi65nYZwqSF7FFnP7Aw,4324
|
|
78
88
|
evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
79
89
|
evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
|
|
80
|
-
evalscope/benchmarks/arc/arc_adapter.py,sha256=
|
|
90
|
+
evalscope/benchmarks/arc/arc_adapter.py,sha256=OO2khZxfgsRzYk64zLvq4yEbgPdQuvbIVPO4t0E4Hcc,6703
|
|
81
91
|
evalscope/benchmarks/arena_hard/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
82
|
-
evalscope/benchmarks/arena_hard/arena_hard_adapter.py,sha256=
|
|
83
|
-
evalscope/benchmarks/arena_hard/utils.py,sha256=
|
|
92
|
+
evalscope/benchmarks/arena_hard/arena_hard_adapter.py,sha256=iJeIh-xiQbuc3E8ea48DTCfDW_KnlGMdTeIek5AlKnk,6668
|
|
93
|
+
evalscope/benchmarks/arena_hard/utils.py,sha256=kRgKXdVt4Ep3XGOzUQpf9JThnp1OOt8oUQhvQEtOzRY,4596
|
|
84
94
|
evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
85
95
|
evalscope/benchmarks/bbh/bbh_adapter.py,sha256=IFu9XctrLNJcIFXK4jV3LmyqQCVb66z8YhL07Osc1TA,8623
|
|
86
96
|
evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
|
|
@@ -111,19 +121,19 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt
|
|
|
111
121
|
evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
|
|
112
122
|
evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
|
|
113
123
|
evalscope/benchmarks/bfcl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
114
|
-
evalscope/benchmarks/bfcl/bfcl_adapter.py,sha256=
|
|
124
|
+
evalscope/benchmarks/bfcl/bfcl_adapter.py,sha256=ThDOYrJY_RdXMLSC1S9lP-8zYd1syZWpcrXXV1ZPLVs,10100
|
|
115
125
|
evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
116
|
-
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=
|
|
126
|
+
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=V_TC_E0lKXaFcV_qIdrg2_iddmGJ4um8iIdaXVaK_EM,11146
|
|
117
127
|
evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
|
|
118
128
|
evalscope/benchmarks/chinese_simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
119
129
|
evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=kaZ8fZK2a9oVwpGRUA3wz3FkxtcTY_FkRDYrdLjDNro,8433
|
|
120
130
|
evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
121
131
|
evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
|
|
122
|
-
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=
|
|
132
|
+
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=3oh79iFR006vnlpwjsRVO5cl6pOav00I5uU98DPCORM,10119
|
|
123
133
|
evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
|
|
124
134
|
evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
125
135
|
evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
|
|
126
|
-
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=
|
|
136
|
+
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=ZBIZJZDSy-b9lTgm2-ZU2pEh053rveMwccI1fu6xpkc,7038
|
|
127
137
|
evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
128
138
|
evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=z_wbrA4yJoMwfg4TJkvEZB2aV5cPFcxCZ3JIj49F4Do,2604
|
|
129
139
|
evalscope/benchmarks/docmath/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -135,10 +145,13 @@ evalscope/benchmarks/drop/utils.py,sha256=Z9PHrNnRfGqFHCLONg5SWKARp1eTJlHFc_bU46
|
|
|
135
145
|
evalscope/benchmarks/frames/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
136
146
|
evalscope/benchmarks/frames/frames_adapter.py,sha256=xYvxGzqj_YPDSZYogP9TxUhOxvZFbud1S2SOvz1nlDU,3136
|
|
137
147
|
evalscope/benchmarks/frames/utils.py,sha256=gULWM6Rwv5bTSSWcDYp-iSIoWj8r5VtbQakhRzHJq8A,1172
|
|
148
|
+
evalscope/benchmarks/general_arena/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
149
|
+
evalscope/benchmarks/general_arena/general_arena_adapter.py,sha256=j2aDzikz9obxvrR-damdvSCXR0rfjEo-OzX8vujj2N0,19887
|
|
150
|
+
evalscope/benchmarks/general_arena/utils.py,sha256=u0q4FNIOFka1_gC344OCvBXUz89Ah6M8asjIXbNSweM,7188
|
|
138
151
|
evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
139
|
-
evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=
|
|
152
|
+
evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=cPN-p0tndjocQYqfc6OFkT5k8KL7kkVklmOtps-F08Y,5391
|
|
140
153
|
evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
141
|
-
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=
|
|
154
|
+
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=dpIGe635CoW4ejVohVwcarBxSckqvlnxcJ2ElpRlQ9o,5669
|
|
142
155
|
evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
143
156
|
evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
|
|
144
157
|
evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=J6RfxpUT1l8Jj3vT_Vtsn1z8MKCg32XTlKn_eihCI50,5071
|
|
@@ -147,40 +160,42 @@ evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTv
|
|
|
147
160
|
evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=IBMdsvQ1w45_raCiACTBm7DVHtOYfckv8x15_OXIwTI,10752
|
|
148
161
|
evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
149
162
|
evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
|
|
150
|
-
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=
|
|
163
|
+
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=kgHz-n8_93J8DdR7XBlzfM2KDRoKcvg80h6CCjWv_Xk,6191
|
|
164
|
+
evalscope/benchmarks/hle/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
165
|
+
evalscope/benchmarks/hle/hle_adapter.py,sha256=ts38e-AqtUcbfc6VqRtWLacZDh7KzSm4rj7xKm9vTFc,4445
|
|
151
166
|
evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
152
167
|
evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
|
|
153
|
-
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=
|
|
168
|
+
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=ZqNG3L8yMY44B7HleUjlSbVG-GLk9RBsvaGWOm2fQVw,4788
|
|
154
169
|
evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
155
|
-
evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=
|
|
170
|
+
evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=2oStqiTD4w2f2n0kbjcbg7GJQfKCsHFieokQcNndWb4,2041
|
|
156
171
|
evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
|
|
157
172
|
evalscope/benchmarks/ifeval/instructions_registry.py,sha256=tVUmhuSwnOidLtI8onOAw_gpJ6bi8FL07GiX19hSuo8,7288
|
|
158
173
|
evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2OF-oH_NcSZtzyZDURW4,25780
|
|
159
174
|
evalscope/benchmarks/ifeval/utils.py,sha256=TKrM1m2qDCUauahogItDdICf4mDk0OjasSxgnxjt2KY,4517
|
|
160
175
|
evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
161
|
-
evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=
|
|
176
|
+
evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=kYXKiiFa_F5Gl3mIOAtKxXW5myi0VW_XGidbSjArd6M,2730
|
|
162
177
|
evalscope/benchmarks/live_code_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
163
178
|
evalscope/benchmarks/live_code_bench/evaluate_utils.py,sha256=iqmVUMZmyRhzOOXXQ-NN9P1nGvvbzTjOSEp6djbN_rw,6503
|
|
164
179
|
evalscope/benchmarks/live_code_bench/extract_utils.py,sha256=ZcQ8y741uawPo6I_1_XglR3eqJFDNrqc8fILKZupVRs,2375
|
|
165
|
-
evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=
|
|
180
|
+
evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=a4Vz73V1q8A0tV_DeKnTJKVxnDWmXs84diaqSym8gLM,3550
|
|
166
181
|
evalscope/benchmarks/live_code_bench/load_utils.py,sha256=5i9wtdPLYR8ckjx5MaYQVC2LFYvjKzR6Fa6UZmeOTRc,2445
|
|
167
182
|
evalscope/benchmarks/live_code_bench/pass_k_utils.py,sha256=Ktrp_lXdfFzoHtQNQNdGfIl26ySjaPCHm4Zv-dFvRqM,2024
|
|
168
183
|
evalscope/benchmarks/live_code_bench/prompts.py,sha256=P4KILIAIDT1MKDck0xHYV_6v9820wDZRhxVMazmlL-g,12600
|
|
169
184
|
evalscope/benchmarks/live_code_bench/testing_util.py,sha256=abjlwp6HDayf88mMI_daOKm06nEOeNBaMkmGWqk2DJo,17286
|
|
170
185
|
evalscope/benchmarks/maritime_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
171
|
-
evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py,sha256=
|
|
186
|
+
evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py,sha256=aibJmtIJkpvWlyLBiiL7TCdjUGfW8pxkAU2KQEZDIPM,3149
|
|
172
187
|
evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
173
|
-
evalscope/benchmarks/math_500/math_500_adapter.py,sha256=
|
|
188
|
+
evalscope/benchmarks/math_500/math_500_adapter.py,sha256=Oc9XnBgMAjEerYAk3GtY2TTKm1QH_UI896kUuW2_a5Y,2324
|
|
174
189
|
evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
175
190
|
evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
|
|
176
|
-
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=
|
|
191
|
+
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=Rhi-J6oGWawRVBk38ZgXk8-XrZ7wL8sf4zrncU73jgs,12111
|
|
177
192
|
evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
|
|
178
193
|
evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
179
|
-
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=
|
|
194
|
+
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=uglOOZBZfQBIuJOG7iT4THk2LNcfHQoakxQDpS4jB1U,4554
|
|
180
195
|
evalscope/benchmarks/mmlu_redux/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
181
|
-
evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=
|
|
196
|
+
evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=fYtAVKKGGfzRnDlEzU7IULruj2vYzey9aWoyZBBeftc,9886
|
|
182
197
|
evalscope/benchmarks/musr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
183
|
-
evalscope/benchmarks/musr/musr_adapter.py,sha256=
|
|
198
|
+
evalscope/benchmarks/musr/musr_adapter.py,sha256=YTRFGsVuogdYlZoylfD3ij4AbyYrvT4hpY7MueVfu6c,2691
|
|
184
199
|
evalscope/benchmarks/needle_haystack/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
185
200
|
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py,sha256=AybH_Ka2B2WCh-EvwAsMPlCGzJ78dHBhe5sJ6nDgNK4,15691
|
|
186
201
|
evalscope/benchmarks/needle_haystack/utils.py,sha256=k8WDigqt5LgzHw6DtaYsLtb3BJL0FTZS9JOyJCpoPq8,2935
|
|
@@ -189,7 +204,7 @@ evalscope/benchmarks/process_bench/critique_template.txt,sha256=tycx8n42QEC0uGcw
|
|
|
189
204
|
evalscope/benchmarks/process_bench/process_bench_adapter.py,sha256=ULuXG68ifTEc_ucH_cj0p5AGdbL-ahA7kcJ-AzYVmSM,3767
|
|
190
205
|
evalscope/benchmarks/race/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
191
206
|
evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO61nU,3608
|
|
192
|
-
evalscope/benchmarks/race/race_adapter.py,sha256=
|
|
207
|
+
evalscope/benchmarks/race/race_adapter.py,sha256=FW_FSUGq5Iyz2cTACdk3qOqDt2kXwtCpVB9FT_Bc6LM,6635
|
|
193
208
|
evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
|
|
194
209
|
evalscope/benchmarks/simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
195
210
|
evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=gQzrmslukHOJf-VBSnVKYddIg34EEOvQuGYTurQgBy0,9289
|
|
@@ -198,18 +213,20 @@ evalscope/benchmarks/super_gpqa/five_shot_prompt.txt,sha256=CQxRszzUrSIygOSd1G10
|
|
|
198
213
|
evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=ce99v28wkhlGnfmihwpv3ikTqy3aumT8Jzm1LGxz-ck,10147
|
|
199
214
|
evalscope/benchmarks/super_gpqa/utils.py,sha256=ftYPP9ODvLBlQSd9ltACx9iRIvjB8u1bg4AtgcJ4JAI,3360
|
|
200
215
|
evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt,sha256=XZb0CN83YbfH2dF-iIV-ciNLbIb3ON220qHe7zf8KF0,247
|
|
216
|
+
evalscope/benchmarks/tau_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
217
|
+
evalscope/benchmarks/tau_bench/tau_bench_adapter.py,sha256=5_VgRUtEjeZ-8gRZj4cnwwso1GUqf2GB49AlI4xqyDM,4221
|
|
201
218
|
evalscope/benchmarks/tool_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
202
|
-
evalscope/benchmarks/tool_bench/tool_bench_adapter.py,sha256=
|
|
219
|
+
evalscope/benchmarks/tool_bench/tool_bench_adapter.py,sha256=fy6Hb84cm6s-pOoQXmT-N8D1OUYVGCuq77-2xwM_WLA,3093
|
|
203
220
|
evalscope/benchmarks/tool_bench/utils.py,sha256=led0d-Pa3rvmWkSWhEnZWP00fceudgESq5HXAQzJGls,7042
|
|
204
221
|
evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
205
222
|
evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
|
|
206
223
|
evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
|
|
207
|
-
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=
|
|
224
|
+
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=IT5l6cFzZQi2i68kp8rWBdXWxiDVd14MORgk-lusPBM,5516
|
|
208
225
|
evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
209
226
|
evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
|
|
210
227
|
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=xY4Kr-GzyyE_TWGlaKL5mo9qTaza0frWLy7EgIwlZn4,12958
|
|
211
228
|
evalscope/benchmarks/winogrande/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
212
|
-
evalscope/benchmarks/winogrande/winogrande_adapter.py,sha256=
|
|
229
|
+
evalscope/benchmarks/winogrande/winogrande_adapter.py,sha256=GkmTsrlpU1IA-E7dJXmsHXyY9ivRbmbeVKxFmMwWtLc,2209
|
|
213
230
|
evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
214
231
|
evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
|
|
215
232
|
evalscope/cli/cli.py,sha256=w_dtXljur9s5lmTn6LbbFL_viTPQB1WAEzhYcId09Og,729
|
|
@@ -221,15 +238,13 @@ evalscope/collections/__init__.py,sha256=3v7tVLcJk86FeNBrxw3pWhu_lcpKYrnT_dDACCe
|
|
|
221
238
|
evalscope/collections/evaluator.py,sha256=RJ337S0sy8dsV25I2OAxeWgSx_HrmXTyuuHKSt9vQtM,17474
|
|
222
239
|
evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
|
|
223
240
|
evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
|
|
224
|
-
evalscope/evaluator/__init__.py,sha256=
|
|
225
|
-
evalscope/evaluator/evaluator.py,sha256=
|
|
226
|
-
evalscope/
|
|
227
|
-
evalscope/
|
|
228
|
-
evalscope/
|
|
229
|
-
evalscope/metrics/__init__.py,sha256=g96dZSt3Dh56TdVbe4yDqcfmr9DoLqH-R2__3Qvorjk,1497
|
|
230
|
-
evalscope/metrics/llm_judge.py,sha256=O2IaJpsBe1HqfCVnRYOt_PLWg6w85DYlYLU7yTq5idw,4384
|
|
241
|
+
evalscope/evaluator/__init__.py,sha256=XqPnEp5MvfRwC5M5cEeOAC0-MMEPxBIESqiSa3YMBgo,84
|
|
242
|
+
evalscope/evaluator/evaluator.py,sha256=HKEF2k0S_dJR8cF9lrqf_W4diXbb6H3L81pD6XcmLiA,22481
|
|
243
|
+
evalscope/metrics/__init__.py,sha256=CH3bNyRx9dJ3gOqNwKDlaZ7zan4MShM0h8SnzarjokU,1851
|
|
244
|
+
evalscope/metrics/completion_parsers.py,sha256=56ZNzOfNU0O1ba9fs9Cyi4Vk_YUmcgWUbxW0SJ2KrlU,8974
|
|
245
|
+
evalscope/metrics/llm_judge.py,sha256=1hPFnGc3Szszqo21O618a7mxOgkdba3KsbZ66vvTbSA,8380
|
|
231
246
|
evalscope/metrics/math_parser.py,sha256=JtOkj28XOtwoUACXOXLzCeRYz0rx0tBsQLQDU8cbC20,17311
|
|
232
|
-
evalscope/metrics/metrics.py,sha256=
|
|
247
|
+
evalscope/metrics/metrics.py,sha256=OLfvEljGbQnv-bBiFD-GR2On4mpZ0xhKxiKkjZfoDX8,14268
|
|
233
248
|
evalscope/metrics/named_metrics.py,sha256=PrzU_1mGTeRFxVJFT1aXxIOiS7MnNoWyZsb8uCRVDeE,2278
|
|
234
249
|
evalscope/metrics/rouge_metric.py,sha256=bqvSotuDdC0MEKmt8v6y6tBTBx0S3Ma-tfF-cMCckA4,4645
|
|
235
250
|
evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
|
|
@@ -335,75 +350,57 @@ evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.p
|
|
|
335
350
|
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py,sha256=LqMHlUTy2LEzoVwjALtrAw0UYmzIuHnFjQiVmn5nv-I,605
|
|
336
351
|
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py,sha256=d4HInkL_Phk0Bgg2cWaOvhsPa6lkqDeovFW86PL0I18,6371
|
|
337
352
|
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py,sha256=XzebAHBAjOpkIMZm43dd55PESgmyq_J45Ji6bogYR3s,11204
|
|
338
|
-
evalscope/models/__init__.py,sha256=
|
|
353
|
+
evalscope/models/__init__.py,sha256=x0Sna8mbujdOVqIYSGwIULbiPOue_Ifp-2JElSZsuMs,1481
|
|
339
354
|
evalscope/models/local_model.py,sha256=UWsmZlWpT8JNGjijzZQKirvq4YywBkKOS9G-U2cuxAw,4115
|
|
340
|
-
evalscope/models/
|
|
341
|
-
evalscope/models/
|
|
342
|
-
evalscope/models/adapters/
|
|
343
|
-
evalscope/models/adapters/
|
|
344
|
-
evalscope/models/adapters/
|
|
345
|
-
evalscope/models/adapters/
|
|
346
|
-
evalscope/models/adapters/
|
|
347
|
-
evalscope/models/adapters/
|
|
348
|
-
evalscope/models/adapters/
|
|
349
|
-
evalscope/models/adapters/
|
|
355
|
+
evalscope/models/register.py,sha256=G35J6BULFWwuqZO_rTkKBru1llZAyfPztcAASp_cb8M,1257
|
|
356
|
+
evalscope/models/adapters/__init__.py,sha256=WRaZsHlnz0MvGg9Jq565-XJjED-4cAyu4KbmrOhrHO4,688
|
|
357
|
+
evalscope/models/adapters/base_adapter.py,sha256=P4aicNmz1nsX9QLY9t4c6OIQPzIYfOhcrqjlAjR-ENY,3477
|
|
358
|
+
evalscope/models/adapters/bfcl_adapter.py,sha256=cG0vsQ3H2pmabo6tC0Y5Gonw0ng5-RFljDyRBMSj6xE,10422
|
|
359
|
+
evalscope/models/adapters/chat_adapter.py,sha256=epxA_on9ipsak8Lnkweh9en2AjVm5G0L1ARXYmDEEbk,8026
|
|
360
|
+
evalscope/models/adapters/choice_adapter.py,sha256=wIXnDcgnKaIMdhToaqy6fidhuZDpEz2vhxIB_V9u3Z8,8203
|
|
361
|
+
evalscope/models/adapters/custom_adapter.py,sha256=W8DIBiMWvHHcc0Mn9Frjj1YbpHRi7w-UQVJDiU2PakU,2400
|
|
362
|
+
evalscope/models/adapters/server_adapter.py,sha256=W6SXrPy-hZXpnISDjupu_j7bnmt-cP55sDojPXThitc,9701
|
|
363
|
+
evalscope/models/adapters/t2i_adapter.py,sha256=d6OviQFi_uN8PPXKrFpivk5Awm1O6wd_Gii8t3hVahY,2806
|
|
364
|
+
evalscope/models/adapters/tau_bench_adapter.py,sha256=jYGaj2L2wxtEiTdiSwZdY1XNkSzm6os7IvkxgK4msR0,6889
|
|
350
365
|
evalscope/models/custom/__init__.py,sha256=MZylegALg1HerOYtp-qbzu4Wb6PW3JbrxwONHU-PAVs,131
|
|
351
366
|
evalscope/models/custom/custom_model.py,sha256=rBccFVpCIfTGt9cgXLcxeUWc7w1sTRtbTO5w5qqQIQE,1405
|
|
352
|
-
evalscope/models/custom/dummy_model.py,sha256=
|
|
367
|
+
evalscope/models/custom/dummy_model.py,sha256=WpfrS3kvwRRdyThx9baaJ5vodYYh29VGRKsGKMWFflI,3124
|
|
353
368
|
evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
354
|
-
evalscope/perf/arguments.py,sha256=
|
|
355
|
-
evalscope/perf/benchmark.py,sha256=
|
|
356
|
-
evalscope/perf/http_client.py,sha256
|
|
357
|
-
evalscope/perf/main.py,sha256=
|
|
358
|
-
evalscope/perf/plugin/__init__.py,sha256=
|
|
359
|
-
evalscope/perf/plugin/registry.py,sha256=
|
|
360
|
-
evalscope/perf/plugin/api/__init__.py,sha256=
|
|
361
|
-
evalscope/perf/plugin/api/base.py,sha256=
|
|
362
|
-
evalscope/perf/plugin/api/custom_api.py,sha256=
|
|
363
|
-
evalscope/perf/plugin/api/dashscope_api.py,sha256=
|
|
364
|
-
evalscope/perf/plugin/api/
|
|
365
|
-
evalscope/perf/plugin/
|
|
366
|
-
evalscope/perf/plugin/datasets/
|
|
367
|
-
evalscope/perf/plugin/datasets/
|
|
368
|
-
evalscope/perf/plugin/datasets/
|
|
369
|
-
evalscope/perf/plugin/datasets/
|
|
370
|
-
evalscope/perf/plugin/datasets/
|
|
371
|
-
evalscope/perf/plugin/datasets/
|
|
372
|
-
evalscope/perf/plugin/datasets/
|
|
369
|
+
evalscope/perf/arguments.py,sha256=lG2IOOzxg29pdnF6IobzPcqEcYqopulFpVU2QzRaEJA,11429
|
|
370
|
+
evalscope/perf/benchmark.py,sha256=ZVmsSeKDUKkApt3y5tIMMFZAyAj3UNVT7JPp1fh5mhE,7880
|
|
371
|
+
evalscope/perf/http_client.py,sha256=l_OKL80kTP6sM_PEBvsJ1_TejYJdUQnE2UlB-ud1WQM,4588
|
|
372
|
+
evalscope/perf/main.py,sha256=WZbBgFhIj9KqxzC7_NZxDlou019_EXatsHRt5vqDhFg,3439
|
|
373
|
+
evalscope/perf/plugin/__init__.py,sha256=Ztj4h1_JYJqbbWkeuDTj5aTRyGQf5Woc4xEIyjcokVU,94
|
|
374
|
+
evalscope/perf/plugin/registry.py,sha256=GhLe-h1rGzya2bgIUaV5VymQIaHqI7h5SG_i4PoGAm8,1967
|
|
375
|
+
evalscope/perf/plugin/api/__init__.py,sha256=7RsGdYTSfnW6iVpveEzNu8v4x8Yc8H-Kk39DqOHMrd4,152
|
|
376
|
+
evalscope/perf/plugin/api/base.py,sha256=9cX4xwTzy5ycnWqmQqRGMLasTEX6jVlobtADkh1KwXE,2782
|
|
377
|
+
evalscope/perf/plugin/api/custom_api.py,sha256=f8rUixcV9mTxoYyabu3wedEC4YVB70Yw6Az1NpfeWPQ,10375
|
|
378
|
+
evalscope/perf/plugin/api/dashscope_api.py,sha256=Miv2pzMa6sxZyYYJhCzcbOI_QHuZx7tazKpb6Not7ck,3627
|
|
379
|
+
evalscope/perf/plugin/api/default_api.py,sha256=kjuHQ-zRHe5WU4ofSzWBpWbIxBQBOh_ucu1z2g62gWg,4315
|
|
380
|
+
evalscope/perf/plugin/api/openai_api.py,sha256=Mt_VedJUaCH3g-oVSJ_fsGcPk0KkspSzIMkrkih2Zb0,7777
|
|
381
|
+
evalscope/perf/plugin/datasets/__init__.py,sha256=qzeQ9BrJhiJJm1wHaFeOQkvXXdSd15Ucspbn5zjs-6Q,495
|
|
382
|
+
evalscope/perf/plugin/datasets/base.py,sha256=-3Ihnp2hYvZyPnP8Gh2Pu8ovlLNFHyZnNgRu3WHG4d0,2714
|
|
383
|
+
evalscope/perf/plugin/datasets/custom.py,sha256=UuOk8xYfSYyyYZL3U4grUjtfQhWHHZeAEC63n_4Siuw,1376
|
|
384
|
+
evalscope/perf/plugin/datasets/flickr8k.py,sha256=IXz5uu5SlqF1l_tJ_ITr2vx_R_d7gxWzqPuyEOx7rYo,1043
|
|
385
|
+
evalscope/perf/plugin/datasets/kontext_bench.py,sha256=XjKzr7nMzI3cfk83IH0PH1TNJaQMRXUpACnzFfP2n6g,1091
|
|
386
|
+
evalscope/perf/plugin/datasets/line_by_line.py,sha256=c3ydW4GqxkG0vl2g64jG0vBMql2FuFPyWh3mgkIh9Do,987
|
|
387
|
+
evalscope/perf/plugin/datasets/longalpaca.py,sha256=VnMjdHl_JV3NmZ6wRxVlJ99e8PYSjQTcVxoTkl21Ei0,1327
|
|
388
|
+
evalscope/perf/plugin/datasets/openqa.py,sha256=33AR419IrH-FxZRjjcYdAIEZXaX4TKEoirVVfX--N9I,1493
|
|
389
|
+
evalscope/perf/plugin/datasets/random_dataset.py,sha256=NNAXvgFPkLDOSpYNex1DyE4X-ELtQRm13_oBooO30j8,3514
|
|
390
|
+
evalscope/perf/plugin/datasets/random_vl_dataset.py,sha256=F3yA9Ih3YO895lZKCo3i85LeKTzjvGcvhzc8UNN-gUI,3240
|
|
373
391
|
evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=J6q7AF_Re5eHLVejXEw9c1jlk1T1PPmist0yO9UFTPE,2432
|
|
374
392
|
evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
375
|
-
evalscope/perf/utils/analysis_result.py,sha256=
|
|
376
|
-
evalscope/perf/utils/benchmark_util.py,sha256=
|
|
377
|
-
evalscope/perf/utils/db_util.py,sha256=
|
|
393
|
+
evalscope/perf/utils/analysis_result.py,sha256=aoT7JD2zAzBeuZUfncKhJ2odX_7KnymwOmNB1Upam2c,935
|
|
394
|
+
evalscope/perf/utils/benchmark_util.py,sha256=7bHpa5oaqcPJX7DSUkzK9assoFSHC27Q7-QylUOiklQ,7136
|
|
395
|
+
evalscope/perf/utils/db_util.py,sha256=TCdmoEx5iScL6h8wzucPojPwn6J1wTmQqX4sVk-ilHo,11630
|
|
378
396
|
evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
|
|
379
397
|
evalscope/perf/utils/local_server.py,sha256=RL9rGd5tEniZ0aErhHcbVXMX22YmujfE11T3j37VL8k,4684
|
|
380
398
|
evalscope/perf/utils/log_utils.py,sha256=NWSK_ITG4yoVx5GMLbIRGDoXSs90s7X3mftdm37Os2U,1666
|
|
381
399
|
evalscope/perf/utils/rich_display.py,sha256=xZzeryQbYM6Cv8g1ulK6OQUE2CalQ_KtFxiy7pioeEU,8127
|
|
382
|
-
evalscope/
|
|
383
|
-
evalscope/registry/config/cfg_arena.yaml,sha256=rub6ceaQxxB1mbSjdoFf0IaVgGfbOonV2nYRebv2OKo,3292
|
|
384
|
-
evalscope/registry/config/cfg_arena_zhihu.yaml,sha256=tvvihBwvoTjoezwTSaZwoGOB44ysofpnin4pNyY9TfQ,2755
|
|
385
|
-
evalscope/registry/config/cfg_pairwise_baseline.yaml,sha256=d05pBiqOk1ejcdd9XE-opZ_ersyttAesF3Iwa2df8O8,3580
|
|
386
|
-
evalscope/registry/config/cfg_single.yaml,sha256=zjsUC3zhU8z7JURaJiz7npkUbFpP82q1ycqUmObC-hc,3056
|
|
387
|
-
evalscope/registry/data/question.jsonl,sha256=WQw5FXvFYerdfwPK1L4YwrWX-TApeAr2X4Zxjznq-oc,12885
|
|
388
|
-
evalscope/registry/data/prompt_template/lmsys_v2.jsonl,sha256=F3PcsoO_UOCztLNmGDYd90K4z4eVufBWz5prKrcqHG0,10554
|
|
389
|
-
evalscope/registry/data/prompt_template/prompt_templates.jsonl,sha256=F3PcsoO_UOCztLNmGDYd90K4z4eVufBWz5prKrcqHG0,10554
|
|
390
|
-
evalscope/registry/data/qa_browser/battle.jsonl,sha256=2MXcYoMItBmttQxSMh2Oa0x51xxqJaWEgSuERUx1O_0,1185590
|
|
391
|
-
evalscope/registry/data/qa_browser/category_mapping.yaml,sha256=3r9nUIciW9205qbtOQF7aI_etM191cM3vlWU8ueG2Co,484
|
|
392
|
-
evalscope/registry/tasks/arc.yaml,sha256=MghUuCmZPEwGqwYhA8ClRWHiSwC3kbHcKMRicQl9aqc,765
|
|
393
|
-
evalscope/registry/tasks/bbh.yaml,sha256=GE3PpE8zw_SROj41LZ5bTm6ZXXZjYOorAdwBCTEePXM,604
|
|
394
|
-
evalscope/registry/tasks/bbh_mini.yaml,sha256=8o9ZiWaCTkN2uTwiOhjBQuyKm7GUw6ZfUZxb2bkOmvs,678
|
|
395
|
-
evalscope/registry/tasks/ceval.yaml,sha256=XDaszb7DROKk8nQDiklirTvDJwkOUJtIN_tcUFVvIJk,703
|
|
396
|
-
evalscope/registry/tasks/ceval_mini.yaml,sha256=4aYW4c0IzgAXSs5dp4d8dJ0OHVp5sD4uiRjChjL1zZg,672
|
|
397
|
-
evalscope/registry/tasks/cmmlu.yaml,sha256=yOgKl1jmfcAfTuUcIMmG5SQhkrbEHEyyP3YuCuIN3l0,703
|
|
398
|
-
evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml,sha256=egdiM5oG7RSs0M-g8QNikwhJ9tZVgw5FiLy-rIYYHAA,737
|
|
399
|
-
evalscope/registry/tasks/general_qa.yaml,sha256=S3kdlrazWX2VAX2PMhNtBnFZVSnUKBNiAhGEdUqL-8c,702
|
|
400
|
-
evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHMG0LXiM,729
|
|
401
|
-
evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
|
|
402
|
-
evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
|
|
403
|
-
evalscope/report/__init__.py,sha256=mLCgT7G-WPagQHOGz97AOdLQJjyikrswDiXA8d9Wr_Q,923
|
|
400
|
+
evalscope/report/__init__.py,sha256=DIoXbj0mjs1m2kEgFvIyqy4skDuoBu0UDVmTDa60Ymk,905
|
|
404
401
|
evalscope/report/combinator.py,sha256=4ahUtTFPTNiSjamldX3IcLf33yKTJKs6ZsC4fsCafe8,4192
|
|
405
402
|
evalscope/report/generator.py,sha256=oykmQROG-Bt8ttCH4RtvmGJ39HmDJMTU6gG26lg5LHE,4321
|
|
406
|
-
evalscope/report/utils.py,sha256=
|
|
403
|
+
evalscope/report/utils.py,sha256=taTSLvMKzAtJ9oha7pe0WF2UZZfEqPQgdj4urq7ZJIE,8298
|
|
407
404
|
evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
408
405
|
evalscope/third_party/longbench_write/README.md,sha256=1yLKeSVIcihpoc4KXr8NpK86JbcLssCPx76aOKdPbYI,5431
|
|
409
406
|
evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
|
|
@@ -439,40 +436,39 @@ evalscope/third_party/toolbench_static/requirements.txt,sha256=OW91Z8hfzh7yQUYgP
|
|
|
439
436
|
evalscope/third_party/toolbench_static/toolbench_static.py,sha256=xE__eXvSwHmmSh1tXNvyBo6MCO4mDlYTbIYl9OGEfNI,2120
|
|
440
437
|
evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
441
438
|
evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=GITEbyiER10Zi-ZWpSqYCdAsiVtNeGK24hvR3kmYn2s,2689
|
|
442
|
-
evalscope/utils/__init__.py,sha256=
|
|
443
|
-
evalscope/utils/
|
|
439
|
+
evalscope/utils/__init__.py,sha256=OiVmYHVkC_d8s6Zp1I6p6oTyhCEGvN-I9E6uzn8dgF4,1940
|
|
440
|
+
evalscope/utils/argument_utils.py,sha256=D7qOH85wf7LKh_cJ2X51OEaL7CMaddydmHZkfoYpvLk,1952
|
|
444
441
|
evalscope/utils/chat_service.py,sha256=U2jtrkOa2asRp16Zam0zIi_38mCyWQqql_L6JSwii4I,8749
|
|
445
|
-
evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
|
|
446
442
|
evalscope/utils/deprecation_utils.py,sha256=WyeiLWSi5ti6FkuMbhimcPPUB43paa1FZ5-JOAWNFZI,1344
|
|
447
|
-
evalscope/utils/
|
|
448
|
-
evalscope/utils/
|
|
449
|
-
evalscope/utils/io_utils.py,sha256=atRCynX9dFcZGxCDip8HRpdzVkkTXCK6y4HzfiOEFU8,5615
|
|
443
|
+
evalscope/utils/import_utils.py,sha256=BSdp7RQSZu67129TBbtJvMWU0CfCFu864K31eiM3pr8,2975
|
|
444
|
+
evalscope/utils/io_utils.py,sha256=2eEkLx4jhekgIV4vYL8yTN0PT6dbHUERMBZwmvxuiEc,7109
|
|
450
445
|
evalscope/utils/logger.py,sha256=Q2IeV_0jxz8L34b5GddPeCKXVh0UClbuhjyLe5Wtj7M,3648
|
|
451
|
-
evalscope/utils/model_utils.py,sha256=
|
|
452
|
-
evalscope/utils/utils.py,sha256=P5gmpINv5UQrwEMrFZKZjdJspsOdGjaBARfRSDVNOd0,11414
|
|
446
|
+
evalscope/utils/model_utils.py,sha256=F1_WBHvBehWqrTd6kPtKICeeYucaZn5H0Gc3cCplYB8,2329
|
|
453
447
|
tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
454
448
|
tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
|
|
449
|
+
tests/utils.py,sha256=Fgm0CU6ilZjCGOfOMJH-Trxy0UIAGbhvy0Ijy_zDGUk,323
|
|
455
450
|
tests/aigc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
456
|
-
tests/aigc/test_t2i.py,sha256=
|
|
451
|
+
tests/aigc/test_t2i.py,sha256=XtVknpwlVMb6FSw3_WMFxMq0gZX6iG-ffdSQkcW2Fzw,3856
|
|
457
452
|
tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
458
|
-
tests/cli/test_all.py,sha256=
|
|
459
|
-
tests/cli/test_collection.py,sha256=
|
|
460
|
-
tests/cli/
|
|
453
|
+
tests/cli/test_all.py,sha256=IT0mxjiuHCC0PpT4z3oN1Bbr_0viMcm8GnShZ02kp8w,6333
|
|
454
|
+
tests/cli/test_collection.py,sha256=bXWzccH822Y2B1Ed251U6TE8G_osI6MXYNxzmfv9kBI,4197
|
|
455
|
+
tests/cli/test_custom.py,sha256=0YE-TCAeaQMRVRFla_TIvTd8d0USvvsSeqvYAD3NDNg,8796
|
|
456
|
+
tests/cli/test_run.py,sha256=YKX2XCHPxnStgzzP67U90RV9r1MC2GM3JoGQqfZKqrI,17324
|
|
461
457
|
tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
462
|
-
tests/perf/test_perf.py,sha256=
|
|
458
|
+
tests/perf/test_perf.py,sha256=AEWvpN3ID6s-9MEoaZjQqUM8VVsqgk_v9KX8pDgvozA,5864
|
|
463
459
|
tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
464
|
-
tests/rag/test_clip_benchmark.py,sha256=
|
|
465
|
-
tests/rag/test_mteb.py,sha256=
|
|
466
|
-
tests/rag/test_ragas.py,sha256=
|
|
460
|
+
tests/rag/test_clip_benchmark.py,sha256=13pcY3gYHNQh2KfEHCqtCSqiOcbngSJ1BlVZzI58JCE,2694
|
|
461
|
+
tests/rag/test_mteb.py,sha256=fdNQIyUEzE7puPCKw5QhCHTEu7hz-ieHeq1xCWGh6IM,7246
|
|
462
|
+
tests/rag/test_ragas.py,sha256=5qozXvPFIb67T-igJv87ijlOgkPnqgkkBVXu6Ht4D0A,4554
|
|
467
463
|
tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
468
|
-
tests/swift/test_run_swift_eval.py,sha256=
|
|
469
|
-
tests/swift/test_run_swift_vlm_eval.py,sha256=
|
|
470
|
-
tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=
|
|
464
|
+
tests/swift/test_run_swift_eval.py,sha256=YbIhYNoI4kAB-ox-OXAKUifLIXTFqP-xGZicrAgK_V0,5784
|
|
465
|
+
tests/swift/test_run_swift_vlm_eval.py,sha256=RwrKkc1WHEZxetM11cGL81G4faKCn7SYn4VlwL03atI,4934
|
|
466
|
+
tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=UAUtOCQ72xbm8s-sov3cBEpYVDy189wpB-qOL3KoU7M,6053
|
|
471
467
|
tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
472
|
-
tests/vlm/test_vlmeval.py,sha256=
|
|
473
|
-
evalscope-0.
|
|
474
|
-
evalscope-0.
|
|
475
|
-
evalscope-0.
|
|
476
|
-
evalscope-0.
|
|
477
|
-
evalscope-0.
|
|
478
|
-
evalscope-0.
|
|
468
|
+
tests/vlm/test_vlmeval.py,sha256=EDQRkYfSyOICUwo_tm3p-puaE_xdFmqOPkrt5etxsqM,3307
|
|
469
|
+
evalscope-0.17.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
|
|
470
|
+
evalscope-0.17.1.dist-info/METADATA,sha256=1PRiimjOBZgSWjvT3iL4VcvdaWk8v3fGp9xCXLpM1Dw,38469
|
|
471
|
+
evalscope-0.17.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
472
|
+
evalscope-0.17.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
|
|
473
|
+
evalscope-0.17.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
|
|
474
|
+
evalscope-0.17.1.dist-info/RECORD,,
|
tests/aigc/test_t2i.py
CHANGED
|
@@ -8,8 +8,8 @@ import unittest
|
|
|
8
8
|
from evalscope.config import TaskConfig
|
|
9
9
|
from evalscope.constants import EvalType, JudgeStrategy, ModelTask, OutputType
|
|
10
10
|
from evalscope.run import run_task
|
|
11
|
-
from evalscope.utils import test_level_list
|
|
12
11
|
from evalscope.utils.logger import get_logger
|
|
12
|
+
from tests.utils import test_level_list
|
|
13
13
|
|
|
14
14
|
os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
|
|
15
15
|
|
tests/cli/test_all.py
CHANGED
|
@@ -9,8 +9,8 @@ import unittest
|
|
|
9
9
|
from evalscope.config import TaskConfig
|
|
10
10
|
from evalscope.constants import EvalType, JudgeStrategy, OutputType
|
|
11
11
|
from evalscope.run import run_task
|
|
12
|
-
from evalscope.utils import test_level_list
|
|
13
12
|
from evalscope.utils.logger import get_logger
|
|
13
|
+
from tests.utils import test_level_list
|
|
14
14
|
|
|
15
15
|
os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
|
|
16
16
|
|
|
@@ -39,7 +39,7 @@ datasets=[
|
|
|
39
39
|
'general_mcq',
|
|
40
40
|
'general_qa',
|
|
41
41
|
'super_gpqa',
|
|
42
|
-
'live_code_bench',
|
|
42
|
+
# 'live_code_bench',
|
|
43
43
|
'mmlu_redux',
|
|
44
44
|
'simple_qa',
|
|
45
45
|
'chinese_simpleqa',
|
|
@@ -53,8 +53,13 @@ datasets=[
|
|
|
53
53
|
'docmath',
|
|
54
54
|
'needle_haystack',
|
|
55
55
|
'bfcl_v3',
|
|
56
|
+
'hle',
|
|
57
|
+
'tau_bench',
|
|
56
58
|
]
|
|
57
59
|
|
|
60
|
+
# Reverse the datasets list to ensure the order is from most recent to oldest
|
|
61
|
+
datasets.reverse()
|
|
62
|
+
|
|
58
63
|
dataset_args={
|
|
59
64
|
'mmlu': {
|
|
60
65
|
'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
|
|
@@ -127,12 +132,23 @@ dataset_args={
|
|
|
127
132
|
'mmlu_redux':{
|
|
128
133
|
'subset_list': ['abstract_algebra']
|
|
129
134
|
},
|
|
130
|
-
'
|
|
135
|
+
'docmath':{
|
|
131
136
|
'subset_list': ['simpshort_testmini']
|
|
132
137
|
},
|
|
133
138
|
'bfcl_v3':{
|
|
134
139
|
'subset_list': ['simple', 'multiple']
|
|
135
|
-
}
|
|
140
|
+
},
|
|
141
|
+
'hle': {
|
|
142
|
+
'subset_list': ['Math', 'Other'],
|
|
143
|
+
},
|
|
144
|
+
'tau_bench': {
|
|
145
|
+
'extra_params': {
|
|
146
|
+
'user_model': 'qwen-plus',
|
|
147
|
+
'api_key': env.get('DASHSCOPE_API_KEY'),
|
|
148
|
+
'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
149
|
+
},
|
|
150
|
+
'subset_list': ['airline'],
|
|
151
|
+
},
|
|
136
152
|
}
|
|
137
153
|
|
|
138
154
|
class TestRun(unittest.TestCase):
|
|
@@ -165,3 +181,51 @@ class TestRun(unittest.TestCase):
|
|
|
165
181
|
)
|
|
166
182
|
|
|
167
183
|
run_task(task_cfg=task_cfg)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
187
|
+
def test_ci_lite(self):
|
|
188
|
+
from evalscope.config import TaskConfig
|
|
189
|
+
|
|
190
|
+
task_cfg = TaskConfig(
|
|
191
|
+
model='qwen-plus',
|
|
192
|
+
api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
193
|
+
api_key= env.get('DASHSCOPE_API_KEY'),
|
|
194
|
+
eval_type=EvalType.SERVICE,
|
|
195
|
+
datasets=[
|
|
196
|
+
'general_mcq',
|
|
197
|
+
'general_qa',
|
|
198
|
+
'iquiz',
|
|
199
|
+
],
|
|
200
|
+
dataset_args={
|
|
201
|
+
'general_mcq': {
|
|
202
|
+
'local_path': 'custom_eval/text/mcq',
|
|
203
|
+
'subset_list': [
|
|
204
|
+
'example'
|
|
205
|
+
],
|
|
206
|
+
},
|
|
207
|
+
'general_qa': {
|
|
208
|
+
'local_path': 'custom_eval/text/qa',
|
|
209
|
+
'subset_list': [
|
|
210
|
+
'example'
|
|
211
|
+
]
|
|
212
|
+
}
|
|
213
|
+
},
|
|
214
|
+
eval_batch_size=1,
|
|
215
|
+
limit=1,
|
|
216
|
+
stream=True,
|
|
217
|
+
generation_config={
|
|
218
|
+
'temperature': 0,
|
|
219
|
+
'n': 1,
|
|
220
|
+
'max_tokens': 4096,
|
|
221
|
+
},
|
|
222
|
+
judge_worker_num=1,
|
|
223
|
+
judge_strategy=JudgeStrategy.AUTO,
|
|
224
|
+
judge_model_args={
|
|
225
|
+
'model_id': 'qwen2.5-72b-instruct',
|
|
226
|
+
'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
227
|
+
'api_key': env.get('DASHSCOPE_API_KEY'),
|
|
228
|
+
}
|
|
229
|
+
)
|
|
230
|
+
|
|
231
|
+
run_task(task_cfg=task_cfg)
|
tests/cli/test_collection.py
CHANGED
|
@@ -5,7 +5,7 @@ import unittest
|
|
|
5
5
|
from evalscope.collections import CollectionSchema, DatasetInfo, WeightedSampler
|
|
6
6
|
from evalscope.constants import EvalType, JudgeStrategy
|
|
7
7
|
from evalscope.utils.io_utils import dump_jsonl_data
|
|
8
|
-
from
|
|
8
|
+
from tests.utils import test_level_list
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class TestCollection(unittest.TestCase):
|