evalscope 0.11.0__py3-none-any.whl → 0.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/arguments.py +3 -1
- evalscope/benchmarks/{aime24 → aime}/aime24_adapter.py +3 -3
- evalscope/benchmarks/aime/aime25_adapter.py +49 -0
- evalscope/benchmarks/arc/arc_adapter.py +14 -17
- evalscope/benchmarks/bbh/bbh_adapter.py +6 -11
- evalscope/benchmarks/benchmark.py +12 -10
- evalscope/benchmarks/ceval/ceval_adapter.py +10 -15
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +11 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +6 -20
- evalscope/benchmarks/data_adapter.py +82 -19
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +15 -22
- evalscope/benchmarks/general_qa/general_qa_adapter.py +29 -16
- evalscope/benchmarks/gpqa/gpqa_adapter.py +13 -8
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -4
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +8 -12
- evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -2
- evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
- evalscope/benchmarks/math_500/math_500_adapter.py +9 -4
- evalscope/benchmarks/mmlu/mmlu_adapter.py +11 -16
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +24 -36
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +71 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/critique_template.txt +13 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +99 -0
- evalscope/benchmarks/race/race_adapter.py +12 -16
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +20 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
- evalscope/benchmarks/super_gpqa/utils.py +90 -0
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +7 -14
- evalscope/benchmarks/utils.py +43 -0
- evalscope/cli/start_app.py +4 -1
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +4 -2
- evalscope/collections/evaluator.py +16 -1
- evalscope/config.py +13 -3
- evalscope/constants.py +7 -0
- evalscope/evaluator/evaluator.py +3 -1
- evalscope/metrics/__init__.py +2 -1
- evalscope/metrics/metrics.py +23 -2
- evalscope/metrics/named_metrics.py +1 -0
- evalscope/models/__init__.py +2 -1
- evalscope/models/base_adapter.py +32 -6
- evalscope/models/chat_adapter.py +4 -1
- evalscope/models/choice_adapter.py +4 -0
- evalscope/models/custom_adapter.py +2 -0
- evalscope/models/local_model.py +3 -2
- evalscope/models/register.py +28 -0
- evalscope/models/server_adapter.py +107 -29
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +18 -8
- evalscope/perf/http_client.py +8 -6
- evalscope/perf/plugin/api/openai_api.py +11 -1
- evalscope/perf/utils/analysis_result.py +1 -1
- evalscope/perf/utils/benchmark_util.py +6 -2
- evalscope/report/app.py +15 -8
- evalscope/report/combinator.py +2 -2
- evalscope/run.py +6 -5
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +429 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
- evalscope/utils/chat_service.py +1 -0
- evalscope/utils/filters.py +59 -0
- evalscope/utils/logger.py +3 -3
- evalscope/utils/model_utils.py +17 -1
- evalscope/utils/utils.py +45 -45
- evalscope/version.py +2 -2
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/METADATA +14 -5
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/RECORD +89 -65
- tests/cli/test_collection.py +1 -1
- tests/cli/test_run.py +151 -32
- /evalscope/benchmarks/{aime24 → aime}/__init__.py +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/LICENSE +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/WHEEL +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/top_level.txt +0 -0
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
|
|
2
|
-
evalscope/arguments.py,sha256=
|
|
3
|
-
evalscope/config.py,sha256=
|
|
4
|
-
evalscope/constants.py,sha256=
|
|
5
|
-
evalscope/run.py,sha256=
|
|
2
|
+
evalscope/arguments.py,sha256=QT3f_oBDl1jXl68rgHVBsOxWeJTw1zXFmm7Zu1VRMQU,4826
|
|
3
|
+
evalscope/config.py,sha256=eQ_r94W_uQiF9ZWN-k84KxrT85E3YiJklDuM5mIKt_s,9124
|
|
4
|
+
evalscope/constants.py,sha256=l6xkVknVybi3frXaftksRZNaCFcw9ZJZ8ORJeWDJEaQ,3615
|
|
5
|
+
evalscope/run.py,sha256=ae6WsKllRt5xanRRFJWSBkVEjCf-Lgx35nlLyqOxctU,5785
|
|
6
6
|
evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
|
|
7
7
|
evalscope/summarizer.py,sha256=Wnt8Y61PasOkigo8zMpi1qdFzfETjfnDtCDDQ6VwgDw,5867
|
|
8
|
-
evalscope/version.py,sha256=
|
|
8
|
+
evalscope/version.py,sha256=KVyRitFqvCQM-1iaU2VOfx7rh9IDqOUGstYhQ6DLAI4,119
|
|
9
9
|
evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
|
|
11
11
|
evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
|
|
@@ -56,15 +56,17 @@ evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4ty
|
|
|
56
56
|
evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
|
|
57
57
|
evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
|
|
58
58
|
evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
|
|
59
|
-
evalscope/benchmarks/benchmark.py,sha256=
|
|
60
|
-
evalscope/benchmarks/data_adapter.py,sha256=
|
|
61
|
-
evalscope/benchmarks/
|
|
62
|
-
evalscope/benchmarks/
|
|
59
|
+
evalscope/benchmarks/benchmark.py,sha256=AByXFsuia3lqCLFsPRt95UR7SxwEuAGpeuKBVjb7jLE,2463
|
|
60
|
+
evalscope/benchmarks/data_adapter.py,sha256=JwptQHL4DbcZ_Ll0kJ0QL8rgK2ZVFftyAXiUWKcrvL4,15532
|
|
61
|
+
evalscope/benchmarks/utils.py,sha256=6kxeBz4w8Fw68AYH05a4ncjgkaUV4bU3eaFVLqOdkMI,1321
|
|
62
|
+
evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
63
|
+
evalscope/benchmarks/aime/aime24_adapter.py,sha256=dBm9yukt4-CByEPUlAPAIN6mL3VkZcI-dw2kz4oQBMo,1715
|
|
64
|
+
evalscope/benchmarks/aime/aime25_adapter.py,sha256=FB_NufY2V7uYdxVnrY_4y81gyyfYDnvedz1_zHdDWt4,1709
|
|
63
65
|
evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
64
66
|
evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
|
|
65
|
-
evalscope/benchmarks/arc/arc_adapter.py,sha256=
|
|
67
|
+
evalscope/benchmarks/arc/arc_adapter.py,sha256=8ksPc6IM266NE7F9Bo-Y9SRZZM-tlCKPfLbJg3VEq9w,6269
|
|
66
68
|
evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
67
|
-
evalscope/benchmarks/bbh/bbh_adapter.py,sha256=
|
|
69
|
+
evalscope/benchmarks/bbh/bbh_adapter.py,sha256=fROpzenrjpEBWtnvM_RL_m0uXPOhXTtYAglJEZbzUdY,8330
|
|
68
70
|
evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
|
|
69
71
|
evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt,sha256=sfo-2iOeVzB0OGgd7NSQFELTGDTsr2DQ3u-g0ivI-sM,3653
|
|
70
72
|
evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt,sha256=UJBsc3Mwz8TZngdWH_NFlhhNbLhNHK6FvW9FHcS8H5g,1167
|
|
@@ -93,101 +95,114 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt
|
|
|
93
95
|
evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
|
|
94
96
|
evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
|
|
95
97
|
evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
96
|
-
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=
|
|
98
|
+
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=B3nO0WmqSyH-LlicqreIPWrxXgVPt1rrp3ndc7YRYiE,11157
|
|
97
99
|
evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
|
|
98
100
|
evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
99
101
|
evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
|
|
100
|
-
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=
|
|
102
|
+
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=zNaYSelcGZulgFLQXp2eD56_QOFRkaXHknfy_VWJciA,10230
|
|
101
103
|
evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
|
|
102
104
|
evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
103
105
|
evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
|
|
104
|
-
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=
|
|
106
|
+
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=F2YCaNDn49X82l06WlLFp2OPFB7nv0ecW40099I9iSE,6871
|
|
105
107
|
evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
106
|
-
evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=
|
|
108
|
+
evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=U4M-0MVJS3-z03YW8nafooFJ7x60e5uEpBO5z_c7zk8,2450
|
|
107
109
|
evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
108
|
-
evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=
|
|
110
|
+
evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=ZVGzUuuQ0UTOqQtXE40ZyBeMOSl8saSiFEQ5_siJ-c8,5052
|
|
109
111
|
evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
110
|
-
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=
|
|
112
|
+
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=wnKUIVc1UvnjI5XGOHf5aCx0H0xTKoZZWAD-Q8AJNAE,4686
|
|
111
113
|
evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
112
114
|
evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
|
|
113
|
-
evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=
|
|
115
|
+
evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=UB287DtnbkSQXZsbReFJqmQRwbo672DTCeXXilR_-Vc,4790
|
|
114
116
|
evalscope/benchmarks/gsm8k/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
115
117
|
evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTvt5hpXg,4233
|
|
116
|
-
evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=
|
|
118
|
+
evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=ZZZ-9oja53IwiU33Kjm7NTk4MbFGWyvonhnHrn_3Na8,10557
|
|
117
119
|
evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
118
120
|
evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
|
|
119
|
-
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=
|
|
121
|
+
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=2CnrIapK51l4bQyFKWWqmOaeBSpkIlq2asetWcp24gs,6057
|
|
120
122
|
evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
121
123
|
evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
|
|
122
|
-
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=
|
|
124
|
+
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=YK4u3JG_Ub4vP-xnsrf-lMheIBdCgFWmirhPUch3biU,5120
|
|
123
125
|
evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
124
|
-
evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=
|
|
126
|
+
evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=R7MILWuMglvXr7yWioBxyJ2T4EdEkwRZ1lnvWqZqG28,1922
|
|
125
127
|
evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
|
|
126
128
|
evalscope/benchmarks/ifeval/instructions_registry.py,sha256=tVUmhuSwnOidLtI8onOAw_gpJ6bi8FL07GiX19hSuo8,7288
|
|
127
129
|
evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2OF-oH_NcSZtzyZDURW4,25780
|
|
128
130
|
evalscope/benchmarks/ifeval/utils.py,sha256=TKrM1m2qDCUauahogItDdICf4mDk0OjasSxgnxjt2KY,4517
|
|
129
131
|
evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
130
|
-
evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=
|
|
132
|
+
evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=16whmFkJt9fLbei9d-kmjnWB_5y5vsiX9tK5kSuxDw8,2449
|
|
131
133
|
evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
132
|
-
evalscope/benchmarks/math_500/math_500_adapter.py,sha256=
|
|
134
|
+
evalscope/benchmarks/math_500/math_500_adapter.py,sha256=SB2eb4Z7DTXdptqirEoctqTdDLEu28s7bLeCAMBmAFo,1923
|
|
133
135
|
evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
134
136
|
evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
|
|
135
|
-
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=
|
|
137
|
+
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=RMZoHAApVOpD3_NeHLcsiM7SpglKpfrGSUhBWPgdAVE,11525
|
|
136
138
|
evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
|
|
137
139
|
evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
138
|
-
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=
|
|
140
|
+
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=OANfue-fK543drJrDj6V_zDMtySrQEBHPgTsejr-e7U,4226
|
|
141
|
+
evalscope/benchmarks/musr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
142
|
+
evalscope/benchmarks/musr/musr_adapter.py,sha256=Po8hcIQiqlFo0AGjcNQe75cpsMNDcfiJaKgZsk33-DY,2442
|
|
143
|
+
evalscope/benchmarks/process_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
144
|
+
evalscope/benchmarks/process_bench/critique_template.txt,sha256=tycx8n42QEC0uGcwbIvHfZvfTnchlRxGz8Tp1R2_e_Y,489
|
|
145
|
+
evalscope/benchmarks/process_bench/process_bench_adapter.py,sha256=ydU-r1T0DaYhOxkhZgGL7PhDd4XoeqOBzVO9oiFPd8M,3422
|
|
139
146
|
evalscope/benchmarks/race/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
140
147
|
evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO61nU,3608
|
|
141
|
-
evalscope/benchmarks/race/race_adapter.py,sha256=
|
|
148
|
+
evalscope/benchmarks/race/race_adapter.py,sha256=RD0B-i5dzeNKuhqnWbremgf4tk9jmOO4_eLAiITB1F0,6381
|
|
142
149
|
evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
|
|
150
|
+
evalscope/benchmarks/simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
151
|
+
evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=FZwXN78X2fV3Dchop_UuFAhNFkwWs12qJlIczgvvrJ8,477
|
|
152
|
+
evalscope/benchmarks/super_gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
153
|
+
evalscope/benchmarks/super_gpqa/five_shot_prompt.txt,sha256=vD3RMeQustxY_oWA8IobntjywT8ZUO7Jaub--rElDT4,4718
|
|
154
|
+
evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=BqNLL8BYnK6tRuIdV6ijL4Uym2SejH_h1BV06XNjSE4,9331
|
|
155
|
+
evalscope/benchmarks/super_gpqa/utils.py,sha256=uhANVnoIaH8-QuzjcVuyVB-8aGOMy94XKUF-TFemY_Q,3578
|
|
156
|
+
evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt,sha256=y7hR9SmoR_YqoEWtT8N9JpZOpeJIlg0cDGDgYw6R6hM,237
|
|
143
157
|
evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
144
158
|
evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
|
|
145
159
|
evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
|
|
146
|
-
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=
|
|
160
|
+
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=7tMc8vVZdBnks5jWrBSrb5BSyjO2eD4On6gX8xqlkV8,4961
|
|
147
161
|
evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
148
162
|
evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
|
|
149
|
-
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=
|
|
163
|
+
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=ueUU860kg5_xf_MtUCa6ck-fGHX3ttw8Xh3mWSJyOZA,12617
|
|
150
164
|
evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
151
165
|
evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
|
|
152
166
|
evalscope/cli/cli.py,sha256=w_dtXljur9s5lmTn6LbbFL_viTPQB1WAEzhYcId09Og,729
|
|
153
|
-
evalscope/cli/start_app.py,sha256=
|
|
154
|
-
evalscope/cli/start_eval.py,sha256=
|
|
155
|
-
evalscope/cli/start_perf.py,sha256=
|
|
167
|
+
evalscope/cli/start_app.py,sha256=WTbba_Iitz1jkQ5n6KHRH-i3U7qJIM7iCi4a9roWjaA,808
|
|
168
|
+
evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,775
|
|
169
|
+
evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
|
|
156
170
|
evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
|
|
157
171
|
evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
|
|
158
|
-
evalscope/collections/evaluator.py,sha256=
|
|
172
|
+
evalscope/collections/evaluator.py,sha256=Zi3uRZhSRIimYye_apZWL6VOiHqaM5znbFA4TBvqSbg,12761
|
|
159
173
|
evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
|
|
160
174
|
evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
|
|
161
175
|
evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
|
|
162
|
-
evalscope/evaluator/evaluator.py,sha256=
|
|
176
|
+
evalscope/evaluator/evaluator.py,sha256=VIiw1eI46UOsFWNd7schD4ah_Q5ll0crl2sRmGIRmig,17649
|
|
163
177
|
evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
|
|
164
178
|
evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
165
179
|
evalscope/evaluator/reviewer/auto_reviewer.py,sha256=PDz1Nt_qq0oGZBBwek2-M8VBUNLkOkmYObzR8gO7nEc,16624
|
|
166
|
-
evalscope/metrics/__init__.py,sha256=
|
|
180
|
+
evalscope/metrics/__init__.py,sha256=SWvqzUzdryW5URz6u4fPkP9XSyA09nQ8zBeE8BbchSg,349
|
|
167
181
|
evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
|
|
168
182
|
evalscope/metrics/math_parser.py,sha256=uTDudn305G3b8-GboWTrDE6OfrEwAW-areHnoGXZ6Is,17302
|
|
169
|
-
evalscope/metrics/metrics.py,sha256=
|
|
170
|
-
evalscope/metrics/named_metrics.py,sha256=
|
|
183
|
+
evalscope/metrics/metrics.py,sha256=_YI7RhxlFu_JOgeE3LF9UKu6mJruvyu4FgqVf78Bjb8,13813
|
|
184
|
+
evalscope/metrics/named_metrics.py,sha256=pSHA2_qdi9B5bDHIh08GYhx63odilSwA_T-95K1Usl0,1380
|
|
171
185
|
evalscope/metrics/rouge_metric.py,sha256=zhIUqenSuxnORR9tamLQBGjFwP91Zei2UiLtcOyseVM,4639
|
|
172
186
|
evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
|
|
173
187
|
evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=Kq6AObenmLVQ5tN3NgN042a6mgRFQmRO21-ohd9mSa8,11972
|
|
174
188
|
evalscope/metrics/resources/gpt2-zhcn3-v4.bpe,sha256=J_K-oSt9usFEw87Av6j7ETn3J48EsmFuY5_iVvY6xjc,524464
|
|
175
189
|
evalscope/metrics/resources/gpt2-zhcn3-v4.json,sha256=WkM4J_FDPPNQwYi0kj5sM5SVjk2_6bci7tqf8dV9p_U,1289341
|
|
176
|
-
evalscope/models/__init__.py,sha256=
|
|
177
|
-
evalscope/models/base_adapter.py,sha256=
|
|
178
|
-
evalscope/models/chat_adapter.py,sha256=
|
|
179
|
-
evalscope/models/choice_adapter.py,sha256=
|
|
180
|
-
evalscope/models/custom_adapter.py,sha256=
|
|
181
|
-
evalscope/models/local_model.py,sha256=
|
|
190
|
+
evalscope/models/__init__.py,sha256=i9vcOBMEF_UM7C2gpmh2GsQk3njwqevoQ6A4CnP1fHs,1000
|
|
191
|
+
evalscope/models/base_adapter.py,sha256=7PbRwfD5PIZCBYVds6ZHI8TBY9C5i2LdPOTu88FJWlY,3414
|
|
192
|
+
evalscope/models/chat_adapter.py,sha256=5-yz7L41OdeBO9J_qRkEZcduATrYIMe__UFfh7BzjIc,6277
|
|
193
|
+
evalscope/models/choice_adapter.py,sha256=fnJdo-FMJ-zvNLbEJGc73odgWXIxtVudL00JIf2vzsA,8239
|
|
194
|
+
evalscope/models/custom_adapter.py,sha256=Za52WF1I_YcJkGomJ6s9sP2Fs8DoJ4HHBYBi3iC3WNI,2379
|
|
195
|
+
evalscope/models/local_model.py,sha256=yydggBCLcBAmUWbBhv7o2CA3RbG0DwDZharPdrkbNcg,2628
|
|
182
196
|
evalscope/models/model.py,sha256=diu4TE1ZFWdynTxsl4DejTNsLdwjxoyj2nsKR-Y8EZE,7343
|
|
183
|
-
evalscope/models/
|
|
197
|
+
evalscope/models/register.py,sha256=4vX6AfScAzwD7UkncbuejfAiQHznQkK5hvtG6jEUbWo,809
|
|
198
|
+
evalscope/models/server_adapter.py,sha256=dS_o9_iC8QY73AehIekYwBQieFECZ97JRfbfleJ-Dtk,6845
|
|
184
199
|
evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk77ksNQ,102
|
|
185
200
|
evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
|
|
186
201
|
evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
|
|
187
|
-
evalscope/perf/__init__.py,sha256=
|
|
188
|
-
evalscope/perf/arguments.py,sha256=
|
|
202
|
+
evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
203
|
+
evalscope/perf/arguments.py,sha256=u3GNdnOBmiEirtgJLspsLO7qBwHeWLoXd4vlt69jJ-g,9717
|
|
189
204
|
evalscope/perf/benchmark.py,sha256=qNgDNseW8N0beuAB_4-JVtTdHs7ZaJEHK5XnkMU9vRU,9618
|
|
190
|
-
evalscope/perf/http_client.py,sha256=
|
|
205
|
+
evalscope/perf/http_client.py,sha256=eoRPaBTCVC4DpgH4tnc-31_h_2PVkWUwCLWK6_TTkhM,7282
|
|
191
206
|
evalscope/perf/main.py,sha256=SUMz8S2XPL8JaSL1-vy8qkrb34d5vp6DfQdwIGOUXTk,1277
|
|
192
207
|
evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
|
|
193
208
|
evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
|
|
@@ -195,7 +210,7 @@ evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2m
|
|
|
195
210
|
evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
|
|
196
211
|
evalscope/perf/plugin/api/custom_api.py,sha256=ay1AGi4y2opjwyRl0J0A54-vLB-pBj3QBFkzog0KA-g,3787
|
|
197
212
|
evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
|
|
198
|
-
evalscope/perf/plugin/api/openai_api.py,sha256=
|
|
213
|
+
evalscope/perf/plugin/api/openai_api.py,sha256=KQRQMOfQceKQtrvTE-SyhNHcDoGuQ0900yh7r74Hcoo,7560
|
|
199
214
|
evalscope/perf/plugin/datasets/__init__.py,sha256=9mz2TnVHhxbEKAS9pLbKMQuIoShNlZpGiRo9e2RQLUs,490
|
|
200
215
|
evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
|
|
201
216
|
evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815
|
|
@@ -205,8 +220,8 @@ evalscope/perf/plugin/datasets/longalpaca.py,sha256=2aENqCly_DX1dyNcurYsLFJIvXYF
|
|
|
205
220
|
evalscope/perf/plugin/datasets/openqa.py,sha256=2pv7yyPSFYTjPhvAGBsHl0eQO8gt7Wk1CaKcfTi3Tnc,1394
|
|
206
221
|
evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=ef8MXhT6756y6LsXSpYeWjmwswu2hRXe2BOVS2_OgVM,1968
|
|
207
222
|
evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
208
|
-
evalscope/perf/utils/analysis_result.py,sha256=
|
|
209
|
-
evalscope/perf/utils/benchmark_util.py,sha256=
|
|
223
|
+
evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
|
|
224
|
+
evalscope/perf/utils/benchmark_util.py,sha256=4TyQ_tE5odcjKDFDueI3jrC0vld6QxmTreOd5_SP4vE,5802
|
|
210
225
|
evalscope/perf/utils/db_util.py,sha256=PSBq16uWyzXx0zyoEE4wazWKN19UAA8_GjobS7rTPso,9001
|
|
211
226
|
evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
|
|
212
227
|
evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627
|
|
@@ -232,8 +247,8 @@ evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHM
|
|
|
232
247
|
evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
|
|
233
248
|
evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
|
|
234
249
|
evalscope/report/__init__.py,sha256=0Wes3ot2hy9s-WwZaBztst8qkNrXkOF-Hwa1WW1e8lY,260
|
|
235
|
-
evalscope/report/app.py,sha256=
|
|
236
|
-
evalscope/report/combinator.py,sha256=
|
|
250
|
+
evalscope/report/app.py,sha256=cvof2Nm4ORxC4D3L22Kg3Ngu3kJwBZlfnFJkwDMCmSQ,26881
|
|
251
|
+
evalscope/report/combinator.py,sha256=O3QirwtYhDhdaWVT4STJMCGZMwoX8BTeJ3HtS9iwnWQ,2567
|
|
237
252
|
evalscope/report/generator.py,sha256=2DULY9W8QCUxdtyfNjo8XAP_YxI1LgR95jknK__kYPU,3600
|
|
238
253
|
evalscope/report/utils.py,sha256=DRlbjbqHEmM8rGlA4pwtlHFhOZtyUzcqiS-mejfIDkU,4584
|
|
239
254
|
evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
@@ -253,6 +268,14 @@ evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl,sha256=odT
|
|
|
253
268
|
evalscope/third_party/longbench_write/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
254
269
|
evalscope/third_party/longbench_write/tools/data_etl.py,sha256=T7a-4PwZg5alZQh-oTi1zjMxjGmVVZYVwSR9-diZlF8,5971
|
|
255
270
|
evalscope/third_party/longbench_write/tools/openai_api.py,sha256=PiIvvDYJkn041SJkLoroXwl1B8TtwpB7licVfqNSeuQ,8168
|
|
271
|
+
evalscope/third_party/thinkbench/__init__.py,sha256=C0aSu71_dc1upUVkKmq2VgDd9plpRcYUdCE6BjUWJcA,110
|
|
272
|
+
evalscope/third_party/thinkbench/eval.py,sha256=76G4LTkxqWCDCyj7Ahjj-qjO1gFem1uDzpRAC27ICl0,18896
|
|
273
|
+
evalscope/third_party/thinkbench/infer.py,sha256=2L4DAJKn3wAhNEKnKudQT60igGOJSKH80FR4nS7DHYk,3952
|
|
274
|
+
evalscope/third_party/thinkbench/resources/critique_template.txt,sha256=d4Egc-qH--4lG8X_EcmgymnuZgiCMbee1M5pt4HrRKA,535
|
|
275
|
+
evalscope/third_party/thinkbench/resources/reformat_template.txt,sha256=zTZyVAzmMBtAwI9lHly9EXsqX471OW-VTg538PDcB30,1775
|
|
276
|
+
evalscope/third_party/thinkbench/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
277
|
+
evalscope/third_party/thinkbench/tools/llm.py,sha256=HCFh58_THsVrFVzvGoThwWRu8EbPXD0DotLQEj5u4Tg,1353
|
|
278
|
+
evalscope/third_party/thinkbench/tools/utils.py,sha256=rDu2GVTK4ji9Yh9RLVksZqrfurQsSuN9GW3QCKJ60ng,401
|
|
256
279
|
evalscope/third_party/toolbench_static/README.md,sha256=Osdnt0_K-setbmYwDPCPRp2LXxamGp2mE8KsOByPPOY,3944
|
|
257
280
|
evalscope/third_party/toolbench_static/__init__.py,sha256=BO936RxwodHr4OEpV6W3S_keC91OfOd41_msIJ2d0fs,128
|
|
258
281
|
evalscope/third_party/toolbench_static/config_default.json,sha256=KrUzeHL2DNiM5FwY7cH3KZlxTwELCQZ6e39nilfUi0M,368
|
|
@@ -262,20 +285,21 @@ evalscope/third_party/toolbench_static/infer.py,sha256=rsADLhEd2IBcC6EI9aD7hSJmo
|
|
|
262
285
|
evalscope/third_party/toolbench_static/requirements.txt,sha256=OW91Z8hfzh7yQUYgP1Di_E6DgNgGoGP1UcvnqrdCR68,22
|
|
263
286
|
evalscope/third_party/toolbench_static/toolbench_static.py,sha256=ABb9Gy09zMt30tY50AZGxSZ46k3NVEsvuDj6xlLOjeA,1966
|
|
264
287
|
evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
265
|
-
evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=
|
|
288
|
+
evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=GITEbyiER10Zi-ZWpSqYCdAsiVtNeGK24hvR3kmYn2s,2689
|
|
266
289
|
evalscope/utils/__init__.py,sha256=jLVoGryuqUh4Km9QWWQBzpqkcVNRK0MbwNaSgckqdiU,139
|
|
267
290
|
evalscope/utils/arena_utils.py,sha256=Gf8VpH4C_oF2Abif_QeL0rAP6tvTzsc0gglpdNkUE48,7155
|
|
268
|
-
evalscope/utils/chat_service.py,sha256=
|
|
291
|
+
evalscope/utils/chat_service.py,sha256=9LNTT-8KsacOLqnQer8j57e224rwOMbU7txV6re-X-A,8720
|
|
269
292
|
evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
|
|
293
|
+
evalscope/utils/filters.py,sha256=x_NX40uWMmUsVrAGHCeeV2e63HZZFugWUgdUhk64ivM,1523
|
|
270
294
|
evalscope/utils/io_utils.py,sha256=Tjdgen1FsAA4ArqiUzu734L0Px5NuiS0GKRRiGIzxSA,4192
|
|
271
|
-
evalscope/utils/logger.py,sha256=
|
|
272
|
-
evalscope/utils/model_utils.py,sha256=
|
|
273
|
-
evalscope/utils/utils.py,sha256=
|
|
295
|
+
evalscope/utils/logger.py,sha256=barHSdtbEu21ynGQj_wS-rd7B02wPPR5AgaWCQzvG4w,3638
|
|
296
|
+
evalscope/utils/model_utils.py,sha256=hB9W334ecAb6553FhooT6_jM0g-tjj6AU48IV3K1CKw,1131
|
|
297
|
+
evalscope/utils/utils.py,sha256=lGvn94ryIzx-7WLNJeuyehNTmINt0jYIjrjW12woPCs,9730
|
|
274
298
|
tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
275
299
|
tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
|
|
276
300
|
tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
277
|
-
tests/cli/test_collection.py,sha256
|
|
278
|
-
tests/cli/test_run.py,sha256=
|
|
301
|
+
tests/cli/test_collection.py,sha256=-CrcAiZVtsY7mXUNVlRjhFWEgmPL5k1dH9PjNhKzYdU,3028
|
|
302
|
+
tests/cli/test_run.py,sha256=flwZZ1PyMnrxy5f36mdUeGSO_ANpr2588dw1zHVQYJY,12735
|
|
279
303
|
tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
280
304
|
tests/perf/test_perf.py,sha256=iB8Mg565SfwPsObdAByHYfZNqN71kUtPW7ucmyiOWo8,3025
|
|
281
305
|
tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -288,9 +312,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
|
|
|
288
312
|
tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
|
|
289
313
|
tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
290
314
|
tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
|
|
291
|
-
evalscope-0.
|
|
292
|
-
evalscope-0.
|
|
293
|
-
evalscope-0.
|
|
294
|
-
evalscope-0.
|
|
295
|
-
evalscope-0.
|
|
296
|
-
evalscope-0.
|
|
315
|
+
evalscope-0.12.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
|
|
316
|
+
evalscope-0.12.1.dist-info/METADATA,sha256=jdU1I5E3YNc8PLfY0NYYDTKiXzTE4HYtX5J6OUPkQ_s,31337
|
|
317
|
+
evalscope-0.12.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
318
|
+
evalscope-0.12.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
|
|
319
|
+
evalscope-0.12.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
|
|
320
|
+
evalscope-0.12.1.dist-info/RECORD,,
|
tests/cli/test_collection.py
CHANGED
|
@@ -44,7 +44,7 @@ class TestCollection(unittest.TestCase):
|
|
|
44
44
|
from evalscope import TaskConfig, run_task
|
|
45
45
|
|
|
46
46
|
task_cfg = TaskConfig(
|
|
47
|
-
model='Qwen2.5-
|
|
47
|
+
model='Qwen2.5-0.5B-Instruct',
|
|
48
48
|
api_url='http://127.0.0.1:8801/v1/chat/completions',
|
|
49
49
|
api_key='EMPTY',
|
|
50
50
|
eval_type=EvalType.SERVICE,
|
tests/cli/test_run.py
CHANGED
|
@@ -4,7 +4,8 @@ import subprocess
|
|
|
4
4
|
import torch
|
|
5
5
|
import unittest
|
|
6
6
|
|
|
7
|
-
from evalscope.
|
|
7
|
+
from evalscope.config import TaskConfig
|
|
8
|
+
from evalscope.constants import EvalType, OutputType
|
|
8
9
|
from evalscope.run import run_task
|
|
9
10
|
from evalscope.utils import is_module_installed, test_level_list
|
|
10
11
|
from evalscope.utils.logger import get_logger
|
|
@@ -71,21 +72,104 @@ class TestRun(unittest.TestCase):
|
|
|
71
72
|
|
|
72
73
|
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
73
74
|
def test_run_task(self):
|
|
74
|
-
task_cfg =
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
75
|
+
task_cfg = TaskConfig(
|
|
76
|
+
model='qwen/Qwen2.5-0.5B-Instruct',
|
|
77
|
+
datasets=[
|
|
78
|
+
'iquiz',
|
|
79
|
+
# 'ifeval',
|
|
80
|
+
# 'mmlu',
|
|
81
|
+
# 'mmlu_pro',
|
|
82
|
+
# 'musr',
|
|
83
|
+
# 'process_bench',
|
|
84
|
+
# 'race',
|
|
85
|
+
# 'trivia_qa',
|
|
86
|
+
# 'cmmlu',
|
|
87
|
+
# 'humaneval',
|
|
88
|
+
# 'super_gpqa',
|
|
89
|
+
# 'gsm8k',
|
|
90
|
+
# 'bbh',
|
|
91
|
+
# 'competition_math',
|
|
92
|
+
# 'math_500',
|
|
93
|
+
'aime24',
|
|
94
|
+
'gpqa',
|
|
95
|
+
# 'arc',
|
|
96
|
+
# 'ceval',
|
|
97
|
+
# 'hellaswag',
|
|
98
|
+
# 'general_mcq',
|
|
99
|
+
# 'general_qa'
|
|
100
|
+
],
|
|
101
|
+
dataset_args={
|
|
102
|
+
'mmlu': {
|
|
103
|
+
'subset_list': ['elementary_mathematics'],
|
|
104
|
+
'few_shot_num': 0
|
|
105
|
+
},
|
|
106
|
+
'mmlu_pro': {
|
|
107
|
+
'subset_list': ['math', 'health'],
|
|
108
|
+
'few_shot_num': 4
|
|
109
|
+
},
|
|
110
|
+
'ceval': {
|
|
111
|
+
'subset_list': [
|
|
112
|
+
'computer_network', 'operating_system', 'computer_architecture'
|
|
113
|
+
],
|
|
114
|
+
'few_shot_num': 0
|
|
115
|
+
},
|
|
116
|
+
'cmmlu': {
|
|
117
|
+
'subset_list': ['elementary_chinese'],
|
|
118
|
+
'few_shot_num': 0
|
|
119
|
+
},
|
|
120
|
+
'bbh': {
|
|
121
|
+
'subset_list': ['word_sorting', 'movie_recommendation'],
|
|
122
|
+
},
|
|
123
|
+
'gpqa': {
|
|
124
|
+
'subset_list': ['gpqa_diamond'],
|
|
125
|
+
'few_shot_num': 0
|
|
126
|
+
},
|
|
127
|
+
'humaneval': {
|
|
128
|
+
'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
|
|
129
|
+
},
|
|
130
|
+
'competition_math': {
|
|
131
|
+
'subset_list': ['Level 1']
|
|
132
|
+
},
|
|
133
|
+
'process_bench': {
|
|
134
|
+
'subset_list': ['gsm8k'],
|
|
135
|
+
},
|
|
136
|
+
'musr': {
|
|
137
|
+
'subset_list': ['murder_mysteries']
|
|
138
|
+
},
|
|
139
|
+
'general_mcq': {
|
|
140
|
+
'local_path': 'custom_eval/text/mcq', # 自定义数据集路径
|
|
141
|
+
'subset_list': [
|
|
142
|
+
'example' # 评测数据集名称,上述 *_dev.csv 中的 *
|
|
143
|
+
],
|
|
144
|
+
'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}' # 问题模板
|
|
145
|
+
},
|
|
146
|
+
'general_qa': {
|
|
147
|
+
'local_path': 'custom_eval/text/qa', # 自定义数据集路径
|
|
148
|
+
'subset_list': [
|
|
149
|
+
'example', # 评测数据集名称,上述 *_dev.csv 中的 *
|
|
150
|
+
# 'test'
|
|
151
|
+
],
|
|
152
|
+
'metric_list': ['AverageBLEU']
|
|
153
|
+
},
|
|
154
|
+
'super_gpqa': {
|
|
155
|
+
'subset_list': ['Philosophy', 'Education'],
|
|
156
|
+
'few_shot_num': 0
|
|
157
|
+
},
|
|
158
|
+
'ifeval': {
|
|
159
|
+
'filters': {
|
|
160
|
+
'remove_until': '</think>'
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
},
|
|
164
|
+
limit=2,
|
|
165
|
+
eval_batch_size=2,
|
|
166
|
+
generation_config={
|
|
167
|
+
'max_new_tokens': 2048,
|
|
168
|
+
'temperature': 0.7,
|
|
169
|
+
'num_return_sequences': 1,
|
|
170
|
+
},
|
|
171
|
+
# debug=True
|
|
172
|
+
)
|
|
89
173
|
run_task(task_cfg=task_cfg)
|
|
90
174
|
|
|
91
175
|
|
|
@@ -101,7 +185,8 @@ class TestRun(unittest.TestCase):
|
|
|
101
185
|
'local_path': 'custom_eval/text/mcq', # 自定义数据集路径
|
|
102
186
|
'subset_list': [
|
|
103
187
|
'example' # 评测数据集名称,上述 *_dev.csv 中的 *
|
|
104
|
-
]
|
|
188
|
+
],
|
|
189
|
+
'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}' # 问题模板
|
|
105
190
|
},
|
|
106
191
|
'general_qa': {
|
|
107
192
|
'local_path': 'custom_eval/text/qa', # 自定义数据集路径
|
|
@@ -111,7 +196,8 @@ class TestRun(unittest.TestCase):
|
|
|
111
196
|
}
|
|
112
197
|
},
|
|
113
198
|
)
|
|
114
|
-
run_task(task_cfg=task_cfg)
|
|
199
|
+
res = run_task(task_cfg=task_cfg)
|
|
200
|
+
print(res)
|
|
115
201
|
|
|
116
202
|
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
117
203
|
def test_run_humaneval(self):
|
|
@@ -140,7 +226,7 @@ class TestRun(unittest.TestCase):
|
|
|
140
226
|
|
|
141
227
|
task_cfg = TaskConfig(
|
|
142
228
|
model='Qwen2.5-0.5B-Instruct',
|
|
143
|
-
api_url='http://127.0.0.1:8801/v1
|
|
229
|
+
api_url='http://127.0.0.1:8801/v1',
|
|
144
230
|
api_key='EMPTY',
|
|
145
231
|
eval_type=EvalType.SERVICE,
|
|
146
232
|
datasets=[
|
|
@@ -148,19 +234,24 @@ class TestRun(unittest.TestCase):
|
|
|
148
234
|
# 'ifeval',
|
|
149
235
|
# 'mmlu',
|
|
150
236
|
# 'mmlu_pro',
|
|
237
|
+
# 'musr',
|
|
238
|
+
# 'process_bench',
|
|
151
239
|
# 'race',
|
|
152
240
|
# 'trivia_qa',
|
|
153
241
|
# 'cmmlu',
|
|
154
242
|
# 'humaneval',
|
|
155
243
|
# 'gsm8k',
|
|
156
244
|
# 'bbh',
|
|
157
|
-
'competition_math',
|
|
158
|
-
'math_500',
|
|
159
|
-
'aime24',
|
|
245
|
+
# 'competition_math',
|
|
246
|
+
# 'math_500',
|
|
247
|
+
# 'aime24',
|
|
160
248
|
'gpqa',
|
|
161
249
|
# 'arc',
|
|
162
|
-
|
|
250
|
+
'ceval',
|
|
163
251
|
# 'hellaswag',
|
|
252
|
+
# 'general_mcq',
|
|
253
|
+
# 'general_qa'
|
|
254
|
+
# 'super_gpqa',
|
|
164
255
|
],
|
|
165
256
|
dataset_args={
|
|
166
257
|
'mmlu': {
|
|
@@ -168,8 +259,8 @@ class TestRun(unittest.TestCase):
|
|
|
168
259
|
'few_shot_num': 0
|
|
169
260
|
},
|
|
170
261
|
'mmlu_pro': {
|
|
171
|
-
'subset_list': ['math'],
|
|
172
|
-
'few_shot_num':
|
|
262
|
+
'subset_list': ['math', 'health'],
|
|
263
|
+
'few_shot_num': 4
|
|
173
264
|
},
|
|
174
265
|
'ceval': {
|
|
175
266
|
'subset_list': [
|
|
@@ -185,8 +276,9 @@ class TestRun(unittest.TestCase):
|
|
|
185
276
|
'subset_list': ['word_sorting', 'movie_recommendation'],
|
|
186
277
|
},
|
|
187
278
|
'gpqa': {
|
|
188
|
-
'subset_list': ['gpqa_diamond'],
|
|
189
|
-
'few_shot_num': 0
|
|
279
|
+
# 'subset_list': ['gpqa_diamond'],
|
|
280
|
+
'few_shot_num': 0,
|
|
281
|
+
'local_path': './data/data/gpqa',
|
|
190
282
|
},
|
|
191
283
|
'humaneval': {
|
|
192
284
|
'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
|
|
@@ -194,15 +286,42 @@ class TestRun(unittest.TestCase):
|
|
|
194
286
|
'competition_math': {
|
|
195
287
|
'subset_list': ['Level 1']
|
|
196
288
|
},
|
|
289
|
+
'process_bench': {
|
|
290
|
+
'subset_list': ['gsm8k'],
|
|
291
|
+
},
|
|
292
|
+
'musr': {
|
|
293
|
+
'subset_list': ['murder_mysteries']
|
|
294
|
+
},
|
|
295
|
+
'general_mcq': {
|
|
296
|
+
'local_path': 'custom_eval/text/mcq', # 自定义数据集路径
|
|
297
|
+
'subset_list': [
|
|
298
|
+
'example' # 评测数据集名称,上述 *_dev.csv 中的 *
|
|
299
|
+
],
|
|
300
|
+
'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}' # 问题模板
|
|
301
|
+
},
|
|
302
|
+
'general_qa': {
|
|
303
|
+
'local_path': 'custom_eval/text/qa', # 自定义数据集路径
|
|
304
|
+
'subset_list': [
|
|
305
|
+
'example', # 评测数据集名称,上述 *_dev.csv 中的 *
|
|
306
|
+
# 'test'
|
|
307
|
+
],
|
|
308
|
+
'metric_list': ['AverageBLEU']
|
|
309
|
+
},
|
|
310
|
+
'super_gpqa': {
|
|
311
|
+
# 'subset_list': ['Philosophy', 'Education'],
|
|
312
|
+
'few_shot_num': 0
|
|
313
|
+
}
|
|
197
314
|
},
|
|
198
|
-
eval_batch_size=
|
|
315
|
+
eval_batch_size=32,
|
|
199
316
|
limit=10,
|
|
200
|
-
debug=True,
|
|
317
|
+
# debug=True,
|
|
318
|
+
stream=False,
|
|
201
319
|
generation_config={
|
|
202
|
-
'temperature': 0
|
|
203
|
-
'n':
|
|
320
|
+
'temperature': 0,
|
|
321
|
+
'n': 1,
|
|
322
|
+
'max_tokens': 4096,
|
|
204
323
|
},
|
|
205
|
-
use_cache='
|
|
324
|
+
# use_cache='./outputs/20250212_150525',
|
|
206
325
|
)
|
|
207
326
|
|
|
208
327
|
run_task(task_cfg=task_cfg)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|