evalscope 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/arguments.py +10 -0
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +3 -3
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
- evalscope/benchmarks/data_adapter.py +4 -2
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +133 -0
- evalscope/benchmarks/drop/utils.py +59 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +8 -4
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +67 -0
- evalscope/benchmarks/tool_bench/utils.py +202 -0
- evalscope/benchmarks/utils.py +3 -2
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
- evalscope/collections/evaluator.py +76 -26
- evalscope/config.py +46 -15
- evalscope/evaluator/evaluator.py +48 -14
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +3 -3
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/models/adapters/chat_adapter.py +51 -34
- evalscope/models/adapters/server_adapter.py +15 -19
- evalscope/perf/arguments.py +14 -5
- evalscope/perf/benchmark.py +4 -9
- evalscope/perf/main.py +69 -17
- evalscope/perf/utils/benchmark_util.py +33 -15
- evalscope/perf/utils/db_util.py +32 -20
- evalscope/perf/utils/log_utils.py +1 -1
- evalscope/perf/utils/rich_display.py +186 -0
- evalscope/report/app.py +47 -34
- evalscope/report/utils.py +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/deprecation_utils.py +42 -0
- evalscope/version.py +2 -2
- {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/METADATA +49 -25
- {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/RECORD +48 -38
- tests/aigc/test_t2i.py +4 -4
- tests/cli/test_all.py +3 -0
- tests/cli/test_collection.py +2 -1
- tests/cli/test_run.py +37 -14
- tests/perf/test_perf.py +27 -2
- {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/LICENSE +0 -0
- {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/WHEEL +0 -0
- {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/top_level.txt +0 -0
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
|
|
2
|
-
evalscope/arguments.py,sha256=
|
|
3
|
-
evalscope/config.py,sha256=
|
|
2
|
+
evalscope/arguments.py,sha256=fZW-om5E2_JaFcEmkvahvundjedPLgIDde-zwDXinG0,5868
|
|
3
|
+
evalscope/config.py,sha256=19QaZ5VS8wknt4sLBxiZkR6pH-nm4Ph3Kl-1bZgcQcE,10799
|
|
4
4
|
evalscope/constants.py,sha256=PHnsGndB4N5-jvmawPxMK5b9geE2Es5cUe8ZKYSuKgM,4016
|
|
5
5
|
evalscope/run.py,sha256=_DKbxgQGwhweBnQrI7lQhu5eoz4LYPVeNanzD4lHuJA,6476
|
|
6
6
|
evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
|
|
7
7
|
evalscope/summarizer.py,sha256=61kU5ZoSh1dd8HMJPqP3ZvJwcY9szwWFCZdu2lfATJA,5920
|
|
8
|
-
evalscope/version.py,sha256=
|
|
8
|
+
evalscope/version.py,sha256=8STVV6Y877B3esrgvovInSk4IFNzxZ_ZEz9ND_6B2lQ,119
|
|
9
9
|
evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
|
|
11
11
|
evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
|
|
@@ -50,27 +50,27 @@ evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=6x-4O2pgsjZCVf
|
|
|
50
50
|
evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
51
51
|
evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
|
|
52
52
|
evalscope/backend/rag_eval/utils/embedding.py,sha256=tFMepPAMO4Kkqeqh-XxXIDYRjGbCMlk7lwuUW7FNvCA,7977
|
|
53
|
-
evalscope/backend/rag_eval/utils/llm.py,sha256=
|
|
53
|
+
evalscope/backend/rag_eval/utils/llm.py,sha256=NHjm0SeQVsSIG8uISXZcQypku4QRc3KtteeO9ldv0FI,2611
|
|
54
54
|
evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
|
|
55
55
|
evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
|
|
56
56
|
evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=sUYvQxCtPl6CrcwhQpY8lJjW5skqWc-fvHUSnXd_MvQ,6054
|
|
57
57
|
evalscope/benchmarks/__init__.py,sha256=5AXNhhmbaBFEe3u7y5TtIrviYzFI-hC8oKqxFILs1pE,937
|
|
58
58
|
evalscope/benchmarks/benchmark.py,sha256=a_7Ctz36McuTyBSTYi56jis9pvOdWhg7JVSPFrbxqR4,2535
|
|
59
|
-
evalscope/benchmarks/data_adapter.py,sha256=
|
|
60
|
-
evalscope/benchmarks/utils.py,sha256=
|
|
59
|
+
evalscope/benchmarks/data_adapter.py,sha256=lcBoXhI1Byn0HcwbVxmIeUFxZlz_wiqte6RDPOR8sbM,18184
|
|
60
|
+
evalscope/benchmarks/utils.py,sha256=jB9w3mN1eOur6j2kpQB_XZJ912fhzC0GaSeHOoylK7M,1087
|
|
61
61
|
evalscope/benchmarks/aigc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
62
62
|
evalscope/benchmarks/aigc/t2i/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
63
63
|
evalscope/benchmarks/aigc/t2i/base.py,sha256=4GFAvceT1Gpt5teDLRCZi62RwvPazuhG3zwft3gN3X4,2102
|
|
64
64
|
evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py,sha256=WV9w3z8TxWNzVzn9A_g0xqeHh76ydnHL5xLwyg63VmU,2992
|
|
65
65
|
evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py,sha256=baDGFRpVcSKpc1CdzNAMBtjeCZDUpyEc5l1KyrPNoEU,1892
|
|
66
|
-
evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py,sha256=
|
|
66
|
+
evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py,sha256=t9h5qlo4KrHOgXIhHo3z6fEAi0HfUqDZvaItQdS7dZ4,2097
|
|
67
67
|
evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py,sha256=U0RKN3apyD3YyZfIvqgO8TNuDO-zctlftHsSfBRyQxU,1825
|
|
68
68
|
evalscope/benchmarks/aigc/t2i/tifa_adapter.py,sha256=vOOiOe26H2dk9VN2WbB_Oi3lzavMIaYDBq6sqeSIiAU,1093
|
|
69
69
|
evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
70
70
|
evalscope/benchmarks/aime/aime24_adapter.py,sha256=GrIxCHpUwgUy8tXGTB7iQOt8k7wG8MJB0CWbwBmIy-8,1703
|
|
71
71
|
evalscope/benchmarks/aime/aime25_adapter.py,sha256=yxo5roCb8ryX9ROUU2FdZ-WBTUPZ14MrBzEL0zPOh-U,1718
|
|
72
72
|
evalscope/benchmarks/alpaca_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
73
|
-
evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=
|
|
73
|
+
evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=oUHpWrt5Gx0jF80RBd7zTh_1AWI66YvDd6U1vOMoqj0,3828
|
|
74
74
|
evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
75
75
|
evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
|
|
76
76
|
evalscope/benchmarks/arc/arc_adapter.py,sha256=0h-eT4BBmUJQrakKMPUNE1nSRwK6LHB-cflWpWzY978,6364
|
|
@@ -110,7 +110,7 @@ evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTP
|
|
|
110
110
|
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=1ITBXI0f01Dt1p7sb2RGswIeg9685Bkk2S2xmA1vat8,11295
|
|
111
111
|
evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
|
|
112
112
|
evalscope/benchmarks/chinese_simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
113
|
-
evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=
|
|
113
|
+
evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=Q6ncuLrCUrrhhljIfMsgWnyhHfcWWwh8iA6NZvz3W28,8079
|
|
114
114
|
evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
115
115
|
evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
|
|
116
116
|
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=r9zael_Y2Jso0ashevYpF8e5SHOBh8iMcPIJU5WT3pQ,10367
|
|
@@ -120,10 +120,13 @@ evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc
|
|
|
120
120
|
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=wgejW-_QswtT8_3JKAQ_H6svH8IotDJDBEH7X4nP4bY,6760
|
|
121
121
|
evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
122
122
|
evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=QgLgIrjD3q53T-lu1UWTV6T4h1cKGoCQDh0O4QxFezw,2569
|
|
123
|
+
evalscope/benchmarks/drop/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
124
|
+
evalscope/benchmarks/drop/drop_adapter.py,sha256=V-Vx6g2_1kcDUDWOKVX1vPSLt5iHn8NQkpWbsIwPaa4,8325
|
|
125
|
+
evalscope/benchmarks/drop/utils.py,sha256=Z9PHrNnRfGqFHCLONg5SWKARp1eTJlHFc_bU46t_YrM,1344
|
|
123
126
|
evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
124
127
|
evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=fqbt61owPP7t2H4B2zbYVZTs0VBGuXNvWGvkukwhRYc,5039
|
|
125
128
|
evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
126
|
-
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=
|
|
129
|
+
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=40mZovspVf-OXcuEu3ei6G_HZlYA8whAHSESHPPONxA,4750
|
|
127
130
|
evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
128
131
|
evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
|
|
129
132
|
evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=UB287DtnbkSQXZsbReFJqmQRwbo672DTCeXXilR_-Vc,4790
|
|
@@ -174,12 +177,15 @@ evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO
|
|
|
174
177
|
evalscope/benchmarks/race/race_adapter.py,sha256=RD0B-i5dzeNKuhqnWbremgf4tk9jmOO4_eLAiITB1F0,6381
|
|
175
178
|
evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
|
|
176
179
|
evalscope/benchmarks/simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
177
|
-
evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=
|
|
180
|
+
evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=TD7hkMLGZ4GK7wD7cwqJ3jCcTAaixOakUy3o5DaPYHI,8997
|
|
178
181
|
evalscope/benchmarks/super_gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
179
182
|
evalscope/benchmarks/super_gpqa/five_shot_prompt.txt,sha256=vD3RMeQustxY_oWA8IobntjywT8ZUO7Jaub--rElDT4,4718
|
|
180
183
|
evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=BqNLL8BYnK6tRuIdV6ijL4Uym2SejH_h1BV06XNjSE4,9331
|
|
181
184
|
evalscope/benchmarks/super_gpqa/utils.py,sha256=ftYPP9ODvLBlQSd9ltACx9iRIvjB8u1bg4AtgcJ4JAI,3360
|
|
182
185
|
evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt,sha256=y7hR9SmoR_YqoEWtT8N9JpZOpeJIlg0cDGDgYw6R6hM,237
|
|
186
|
+
evalscope/benchmarks/tool_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
187
|
+
evalscope/benchmarks/tool_bench/tool_bench_adapter.py,sha256=l2dBcJ4Z3m-8QFtfyFH4IqMtvkY3Rfk021P9Ff_lXWQ,2270
|
|
188
|
+
evalscope/benchmarks/tool_bench/utils.py,sha256=vIPsL8FmMF2JZRHCZeLS_dDeATKNRvZDbq6T-Znlk8Q,7025
|
|
183
189
|
evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
184
190
|
evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
|
|
185
191
|
evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
|
|
@@ -187,6 +193,8 @@ evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=7tMc8vVZdBnks5jWrBSrb
|
|
|
187
193
|
evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
188
194
|
evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
|
|
189
195
|
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=ueUU860kg5_xf_MtUCa6ck-fGHX3ttw8Xh3mWSJyOZA,12617
|
|
196
|
+
evalscope/benchmarks/winogrande/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
197
|
+
evalscope/benchmarks/winogrande/winogrande_adapter.py,sha256=UdANz3YmCtV2YfGuEihTe3vpUTlIxeXBhIqGkKbTFdU,1956
|
|
190
198
|
evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
191
199
|
evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
|
|
192
200
|
evalscope/cli/cli.py,sha256=w_dtXljur9s5lmTn6LbbFL_viTPQB1WAEzhYcId09Og,729
|
|
@@ -195,22 +203,22 @@ evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,7
|
|
|
195
203
|
evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
|
|
196
204
|
evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
|
|
197
205
|
evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
|
|
198
|
-
evalscope/collections/evaluator.py,sha256=
|
|
206
|
+
evalscope/collections/evaluator.py,sha256=3sz_bL0HMFkxq3C-4P6rNGrnQolifVISI5sEpT3Bt90,15754
|
|
199
207
|
evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
|
|
200
208
|
evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
|
|
201
209
|
evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
|
|
202
|
-
evalscope/evaluator/evaluator.py,sha256=
|
|
210
|
+
evalscope/evaluator/evaluator.py,sha256=QzTFXiv_WdPpWTB3PgBNIz9KS_Rxu-fWDvoUpML23aA,21651
|
|
203
211
|
evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
|
|
204
212
|
evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
205
213
|
evalscope/evaluator/reviewer/auto_reviewer.py,sha256=5WRYuXFTDgVmolrOdiTysk-mXrpw6Qg87-iuY-VD1W4,16618
|
|
206
214
|
evalscope/metrics/__init__.py,sha256=y1sdj5FBKYW1q5kLC6QREzoITHwstJRUdji6p0X5aAE,1363
|
|
207
|
-
evalscope/metrics/llm_judge.py,sha256=
|
|
215
|
+
evalscope/metrics/llm_judge.py,sha256=qYHsoBz-zXjL57Czl9CaPcyJT5SZr05giv5Q9SFK3cY,4000
|
|
208
216
|
evalscope/metrics/math_parser.py,sha256=uTDudn305G3b8-GboWTrDE6OfrEwAW-areHnoGXZ6Is,17302
|
|
209
217
|
evalscope/metrics/metrics.py,sha256=_YI7RhxlFu_JOgeE3LF9UKu6mJruvyu4FgqVf78Bjb8,13813
|
|
210
218
|
evalscope/metrics/named_metrics.py,sha256=PrzU_1mGTeRFxVJFT1aXxIOiS7MnNoWyZsb8uCRVDeE,2278
|
|
211
|
-
evalscope/metrics/rouge_metric.py,sha256=
|
|
219
|
+
evalscope/metrics/rouge_metric.py,sha256=bqvSotuDdC0MEKmt8v6y6tBTBx0S3Ma-tfF-cMCckA4,4645
|
|
212
220
|
evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
|
|
213
|
-
evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=
|
|
221
|
+
evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=m7v8ZysO9zCuyThEoGTe5QNVt2GsKMgZpH6du1FQCvg,12110
|
|
214
222
|
evalscope/metrics/t2v_metrics/__init__.py,sha256=GBxgKTPVy_qhW_F3M4Oi6QMWhdAi4PqGX5w3t6Tueho,1783
|
|
215
223
|
evalscope/metrics/t2v_metrics/clipscore.py,sha256=IsrYKIlFb04-FfBq4MbSv4diS6706J15Y3G4qEFIwfU,455
|
|
216
224
|
evalscope/metrics/t2v_metrics/constants.py,sha256=oY5l5fOFl8qylah9eeebZm0pgY1PYmHDa7JlUC8Qls0,451
|
|
@@ -318,19 +326,19 @@ evalscope/models/model.py,sha256=MxvJAUNkuT7IA3bchnmJDur_YCKj9ShOD2Uq40dBcGc,630
|
|
|
318
326
|
evalscope/models/register.py,sha256=pNC69YUvw-lodYpOXmByHm26h4m0Lofgd_om-JhOBq4,1882
|
|
319
327
|
evalscope/models/adapters/__init__.py,sha256=mduiDZ6LgmkefNf4CtObZk6heOB93HxxgqTuYvrqWoo,590
|
|
320
328
|
evalscope/models/adapters/base_adapter.py,sha256=f2FY8DLERudkfb4_anxNVFE_D19xCJj9BObiHWspewI,3268
|
|
321
|
-
evalscope/models/adapters/chat_adapter.py,sha256=
|
|
329
|
+
evalscope/models/adapters/chat_adapter.py,sha256=PAClyBL_nQ1I1kmjeeZ3sdC-y5ZmfFj8rjCigh_vr40,7885
|
|
322
330
|
evalscope/models/adapters/choice_adapter.py,sha256=4fuz3MFEqK8ln4mMs3goMCdRPBwYmmgN70HTdr_sW_U,8005
|
|
323
331
|
evalscope/models/adapters/custom_adapter.py,sha256=w8cD0b3xgcdhSZelcat67CGJnALOfz5IALzURnLjab8,2275
|
|
324
|
-
evalscope/models/adapters/server_adapter.py,sha256=
|
|
332
|
+
evalscope/models/adapters/server_adapter.py,sha256=d-0ne7ymWXmvKf_ypJ0093RNwplZJwhvU2xRwc8rt70,6581
|
|
325
333
|
evalscope/models/adapters/t2i_adapter.py,sha256=xkMRyZ61yTiJfmULK-p9du4nNox41pkHiV2CTFBO3qM,2659
|
|
326
334
|
evalscope/models/custom/__init__.py,sha256=MZylegALg1HerOYtp-qbzu4Wb6PW3JbrxwONHU-PAVs,131
|
|
327
335
|
evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
|
|
328
336
|
evalscope/models/custom/dummy_model.py,sha256=WRT_aCBZLXnC4yRCgggkuySkhM71C47O2Txx_YNc3UM,1933
|
|
329
337
|
evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
330
|
-
evalscope/perf/arguments.py,sha256=
|
|
331
|
-
evalscope/perf/benchmark.py,sha256=
|
|
338
|
+
evalscope/perf/arguments.py,sha256=5dTtaBR9BIobaKkX1Xj-mphHDG4uugnGaVOvWpLfN04,10714
|
|
339
|
+
evalscope/perf/benchmark.py,sha256=eGnxMLQXSYBGRJS4tS8geSJAirnuWo35M4orlRZzei8,7847
|
|
332
340
|
evalscope/perf/http_client.py,sha256=-c3-N7bxKsj3d5DVsKSaYA3XAHJDzZgoqZBbhuDYIGk,7419
|
|
333
|
-
evalscope/perf/main.py,sha256=
|
|
341
|
+
evalscope/perf/main.py,sha256=clHzkQNmv7wv-OWkuNGDQ-8YoLUCWxARIX-Eisinpms,3096
|
|
334
342
|
evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
|
|
335
343
|
evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
|
|
336
344
|
evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
|
|
@@ -349,11 +357,12 @@ evalscope/perf/plugin/datasets/random_dataset.py,sha256=SIlsjAE_Stknfr6o1CBFvANB
|
|
|
349
357
|
evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=J6q7AF_Re5eHLVejXEw9c1jlk1T1PPmist0yO9UFTPE,2432
|
|
350
358
|
evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
351
359
|
evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
|
|
352
|
-
evalscope/perf/utils/benchmark_util.py,sha256=
|
|
353
|
-
evalscope/perf/utils/db_util.py,sha256=
|
|
360
|
+
evalscope/perf/utils/benchmark_util.py,sha256=PcRTeKlEIslBw0zKVS2mFg6GgJ6J8m1f2-gAaEBeiHI,7236
|
|
361
|
+
evalscope/perf/utils/db_util.py,sha256=xqrXZapP_WwUdzkgFBTh3LDBWzr_UoU8v13rOjQ8TT4,9876
|
|
354
362
|
evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
|
|
355
363
|
evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627
|
|
356
|
-
evalscope/perf/utils/log_utils.py,sha256=
|
|
364
|
+
evalscope/perf/utils/log_utils.py,sha256=Xm5A8g8BaozaI_0TaPzr2aAxUBCCf-w7II-FcifrIYg,1503
|
|
365
|
+
evalscope/perf/utils/rich_display.py,sha256=SavP2L44UwN58ZUGR2W1wxM4h4F1iyPa90HhT-Ypkzs,8125
|
|
357
366
|
evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
358
367
|
evalscope/registry/config/cfg_arena.yaml,sha256=rub6ceaQxxB1mbSjdoFf0IaVgGfbOonV2nYRebv2OKo,3292
|
|
359
368
|
evalscope/registry/config/cfg_arena_zhihu.yaml,sha256=tvvihBwvoTjoezwTSaZwoGOB44ysofpnin4pNyY9TfQ,2755
|
|
@@ -376,11 +385,11 @@ evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHM
|
|
|
376
385
|
evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
|
|
377
386
|
evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
|
|
378
387
|
evalscope/report/__init__.py,sha256=iLNqx7CnHSHQmOBqWUK_vt2VIjnvGslJTqn--7B4y_s,316
|
|
379
|
-
evalscope/report/app.py,sha256=
|
|
388
|
+
evalscope/report/app.py,sha256=FxNpiEmbpH_B7D5SYN42idGsyOgkgFrLzScOVrwL3SI,28998
|
|
380
389
|
evalscope/report/app_arguments.py,sha256=1wHTLeFx1G94cKXYOeOVe_wTiOY2D929UctIRGOtRaQ,699
|
|
381
390
|
evalscope/report/combinator.py,sha256=O3QirwtYhDhdaWVT4STJMCGZMwoX8BTeJ3HtS9iwnWQ,2567
|
|
382
391
|
evalscope/report/generator.py,sha256=q9aHWNjQgvutAKtpjfWOpfu5zNFdnXilO9OqBqt_Phg,3612
|
|
383
|
-
evalscope/report/utils.py,sha256=
|
|
392
|
+
evalscope/report/utils.py,sha256=uu-rAzoN6ZIlv52IDWSZCcmNVY3DscNo2f9H9-gjZHY,4602
|
|
384
393
|
evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
385
394
|
evalscope/third_party/longbench_write/README.md,sha256=1yLKeSVIcihpoc4KXr8NpK86JbcLssCPx76aOKdPbYI,5431
|
|
386
395
|
evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
|
|
@@ -413,13 +422,14 @@ evalscope/third_party/toolbench_static/config_default.yaml,sha256=-6n6Zyg9eHN2ee
|
|
|
413
422
|
evalscope/third_party/toolbench_static/eval.py,sha256=do_-lVi_vEoljeLYvt3b_AYSMqpdKzgYnTek9WLSKe8,8236
|
|
414
423
|
evalscope/third_party/toolbench_static/infer.py,sha256=rsADLhEd2IBcC6EI9aD7hSJmo6Oo5b22mnHWBCZLDPs,9010
|
|
415
424
|
evalscope/third_party/toolbench_static/requirements.txt,sha256=OW91Z8hfzh7yQUYgP1Di_E6DgNgGoGP1UcvnqrdCR68,22
|
|
416
|
-
evalscope/third_party/toolbench_static/toolbench_static.py,sha256=
|
|
425
|
+
evalscope/third_party/toolbench_static/toolbench_static.py,sha256=xE__eXvSwHmmSh1tXNvyBo6MCO4mDlYTbIYl9OGEfNI,2120
|
|
417
426
|
evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
418
427
|
evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=GITEbyiER10Zi-ZWpSqYCdAsiVtNeGK24hvR3kmYn2s,2689
|
|
419
428
|
evalscope/utils/__init__.py,sha256=jLVoGryuqUh4Km9QWWQBzpqkcVNRK0MbwNaSgckqdiU,139
|
|
420
429
|
evalscope/utils/arena_utils.py,sha256=Gf8VpH4C_oF2Abif_QeL0rAP6tvTzsc0gglpdNkUE48,7155
|
|
421
430
|
evalscope/utils/chat_service.py,sha256=U2jtrkOa2asRp16Zam0zIi_38mCyWQqql_L6JSwii4I,8749
|
|
422
431
|
evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
|
|
432
|
+
evalscope/utils/deprecation_utils.py,sha256=WyeiLWSi5ti6FkuMbhimcPPUB43paa1FZ5-JOAWNFZI,1344
|
|
423
433
|
evalscope/utils/filters.py,sha256=x_NX40uWMmUsVrAGHCeeV2e63HZZFugWUgdUhk64ivM,1523
|
|
424
434
|
evalscope/utils/import_utils.py,sha256=Oo8saX_mMw4U1RrA7_pn8FmV6P9laru4fEgecqqwpqk,2585
|
|
425
435
|
evalscope/utils/io_utils.py,sha256=Tjdgen1FsAA4ArqiUzu734L0Px5NuiS0GKRRiGIzxSA,4192
|
|
@@ -429,13 +439,13 @@ evalscope/utils/utils.py,sha256=hP_ntROFsZ-zaNVpJtT2prNo8iX-UAKfRtdxbLtPJng,1110
|
|
|
429
439
|
tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
430
440
|
tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
|
|
431
441
|
tests/aigc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
432
|
-
tests/aigc/test_t2i.py,sha256=
|
|
442
|
+
tests/aigc/test_t2i.py,sha256=BcdS3OMypWnraXF4Cq3DhDVRpZq0qo9_0Qpyg54B7FY,2627
|
|
433
443
|
tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
434
|
-
tests/cli/test_all.py,sha256=
|
|
435
|
-
tests/cli/test_collection.py,sha256=
|
|
436
|
-
tests/cli/test_run.py,sha256=
|
|
444
|
+
tests/cli/test_all.py,sha256=O3lXwOV7A0f0rmltofrjpphnshjNtaZC6NUPG-wsQjg,4082
|
|
445
|
+
tests/cli/test_collection.py,sha256=_11mSCWLaiCgheA3uguv6uI3CxqaHUKVwzS6T5BGmxs,4145
|
|
446
|
+
tests/cli/test_run.py,sha256=FTFiAb8Ge5raB1aa0Nzw8DPjFLyAlLfXHRQVIWjvvGE,17798
|
|
437
447
|
tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
438
|
-
tests/perf/test_perf.py,sha256=
|
|
448
|
+
tests/perf/test_perf.py,sha256=VbXsqiqgQY3R3bVKizYQmP04UPluUS26MO6YhTzMs48,4848
|
|
439
449
|
tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
440
450
|
tests/rag/test_clip_benchmark.py,sha256=ZCBtgnF8Vuji6WQlb92-_RIvXlUX_Xt-cHZP4AN_DNI,2552
|
|
441
451
|
tests/rag/test_mteb.py,sha256=YJw6X1jwX6SYNB-ryVb-OHJWu3vsE3Y4STATI75rdG0,5619
|
|
@@ -446,9 +456,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
|
|
|
446
456
|
tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
|
|
447
457
|
tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
448
458
|
tests/vlm/test_vlmeval.py,sha256=UqRiBPMU3vRtLIG1Qu4ZVhyUQx-zGYQuLCgobwf-7a4,3176
|
|
449
|
-
evalscope-0.
|
|
450
|
-
evalscope-0.
|
|
451
|
-
evalscope-0.
|
|
452
|
-
evalscope-0.
|
|
453
|
-
evalscope-0.
|
|
454
|
-
evalscope-0.
|
|
459
|
+
evalscope-0.16.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
|
|
460
|
+
evalscope-0.16.0.dist-info/METADATA,sha256=zX2L_cLxOjX-NNbiR40dmPOxUWyOH86zJycYjr4j5Po,35492
|
|
461
|
+
evalscope-0.16.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
462
|
+
evalscope-0.16.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
|
|
463
|
+
evalscope-0.16.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
|
|
464
|
+
evalscope-0.16.0.dist-info/RECORD,,
|
tests/aigc/test_t2i.py
CHANGED
|
@@ -59,9 +59,9 @@ class TestRun(unittest.TestCase):
|
|
|
59
59
|
},
|
|
60
60
|
datasets=[
|
|
61
61
|
'tifa160',
|
|
62
|
-
'genai_bench',
|
|
63
|
-
'evalmuse',
|
|
64
|
-
'hpdv2',
|
|
62
|
+
# 'genai_bench',
|
|
63
|
+
# 'evalmuse',
|
|
64
|
+
# 'hpdv2',
|
|
65
65
|
],
|
|
66
66
|
dataset_args={
|
|
67
67
|
'tifa160': {
|
|
@@ -81,7 +81,7 @@ class TestRun(unittest.TestCase):
|
|
|
81
81
|
'num_inference_steps': 50,
|
|
82
82
|
'guidance_scale': 7.5
|
|
83
83
|
},
|
|
84
|
-
use_cache='outputs/20250427_134122',
|
|
84
|
+
# use_cache='outputs/20250427_134122',
|
|
85
85
|
)
|
|
86
86
|
|
|
87
87
|
run_task(task_cfg=task_cfg)
|
tests/cli/test_all.py
CHANGED
tests/cli/test_collection.py
CHANGED
|
@@ -78,7 +78,8 @@ class TestCollection(unittest.TestCase):
|
|
|
78
78
|
'model_id': 'qwen2.5-7b-instruct',
|
|
79
79
|
'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
80
80
|
'api_key': os.getenv('DASHSCOPE_API_KEY'),
|
|
81
|
-
}
|
|
81
|
+
},
|
|
82
|
+
use_cache='outputs/20250519_114427'
|
|
82
83
|
)
|
|
83
84
|
res = run_task(task_cfg=task_cfg)
|
|
84
85
|
print(res)
|
tests/cli/test_run.py
CHANGED
|
@@ -73,6 +73,12 @@ class TestRun(unittest.TestCase):
|
|
|
73
73
|
logger.info(f'>>test_run_eval_with_args stdout: {run_res.stdout}')
|
|
74
74
|
logger.error(f'>>test_run_eval_with_args stderr: {run_res.stderr}')
|
|
75
75
|
|
|
76
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
77
|
+
def test_run_yaml_config(self):
|
|
78
|
+
from evalscope import run_task
|
|
79
|
+
|
|
80
|
+
run_task(task_cfg='examples/tasks/eval_native.yaml')
|
|
81
|
+
|
|
76
82
|
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
77
83
|
def test_run_task(self):
|
|
78
84
|
task_cfg = TaskConfig(
|
|
@@ -207,14 +213,18 @@ class TestRun(unittest.TestCase):
|
|
|
207
213
|
from evalscope.config import TaskConfig
|
|
208
214
|
|
|
209
215
|
task_cfg = TaskConfig(
|
|
210
|
-
model='Qwen/
|
|
216
|
+
model='Qwen/Qwen3-1.7B',
|
|
211
217
|
datasets=[
|
|
212
|
-
|
|
218
|
+
'iquiz',
|
|
213
219
|
# 'math_500',
|
|
214
220
|
# 'aime24',
|
|
215
221
|
# 'competition_math',
|
|
216
|
-
'mmlu',
|
|
222
|
+
# 'mmlu',
|
|
223
|
+
# 'simple_qa',
|
|
217
224
|
],
|
|
225
|
+
model_args={
|
|
226
|
+
'device_map': 'auto',
|
|
227
|
+
},
|
|
218
228
|
dataset_args={
|
|
219
229
|
'competition_math': {
|
|
220
230
|
'subset_list': ['Level 4', 'Level 5']
|
|
@@ -224,8 +234,16 @@ class TestRun(unittest.TestCase):
|
|
|
224
234
|
'few_shot_num': 0
|
|
225
235
|
},
|
|
226
236
|
},
|
|
227
|
-
limit=
|
|
228
|
-
eval_batch_size=
|
|
237
|
+
limit=5,
|
|
238
|
+
eval_batch_size=5,
|
|
239
|
+
generation_config={
|
|
240
|
+
'max_new_tokens': 1000, # 最大生成token数,建议设置为较大值避免输出截断
|
|
241
|
+
'temperature': 0.7, # 采样温度 (qwen 报告推荐值)
|
|
242
|
+
'top_p': 0.8, # top-p采样 (qwen 报告推荐值)
|
|
243
|
+
'top_k': 20, # top-k采样 (qwen 报告推荐值)
|
|
244
|
+
'chat_template_kwargs': {'enable_thinking': False} # 关闭思考模式
|
|
245
|
+
},
|
|
246
|
+
judge_strategy=JudgeStrategy.AUTO,
|
|
229
247
|
)
|
|
230
248
|
|
|
231
249
|
run_task(task_cfg=task_cfg)
|
|
@@ -269,7 +287,7 @@ class TestRun(unittest.TestCase):
|
|
|
269
287
|
datasets=[
|
|
270
288
|
# 'iquiz',
|
|
271
289
|
# 'ifeval',
|
|
272
|
-
'mmlu',
|
|
290
|
+
# 'mmlu',
|
|
273
291
|
# 'mmlu_pro',
|
|
274
292
|
# 'musr',
|
|
275
293
|
# 'process_bench',
|
|
@@ -287,10 +305,13 @@ class TestRun(unittest.TestCase):
|
|
|
287
305
|
# 'ceval',
|
|
288
306
|
# 'hellaswag',
|
|
289
307
|
# 'general_mcq',
|
|
290
|
-
# 'general_qa'
|
|
308
|
+
# 'general_qa',
|
|
291
309
|
# 'super_gpqa',
|
|
292
310
|
# 'mmlu_redux',
|
|
293
|
-
# 'maritime_bench'
|
|
311
|
+
# 'maritime_bench',
|
|
312
|
+
# 'drop',
|
|
313
|
+
# 'winogrande',
|
|
314
|
+
'tool_bench',
|
|
294
315
|
],
|
|
295
316
|
dataset_args={
|
|
296
317
|
'mmlu': {
|
|
@@ -356,14 +377,16 @@ class TestRun(unittest.TestCase):
|
|
|
356
377
|
},
|
|
357
378
|
},
|
|
358
379
|
eval_batch_size=32,
|
|
359
|
-
limit=
|
|
380
|
+
limit=10,
|
|
360
381
|
debug=True,
|
|
361
382
|
stream=False,
|
|
362
383
|
generation_config={
|
|
363
384
|
'temperature': 0,
|
|
364
385
|
'n': 1,
|
|
365
386
|
'max_tokens': 4096,
|
|
366
|
-
}
|
|
387
|
+
},
|
|
388
|
+
# ignore_errors=True,
|
|
389
|
+
use_cache='outputs/20250519_142106'
|
|
367
390
|
)
|
|
368
391
|
|
|
369
392
|
run_task(task_cfg=task_cfg)
|
|
@@ -416,12 +439,12 @@ class TestRun(unittest.TestCase):
|
|
|
416
439
|
# 'gsm8k'
|
|
417
440
|
# 'truthful_qa',
|
|
418
441
|
# 'simple_qa',
|
|
419
|
-
|
|
442
|
+
'chinese_simpleqa',
|
|
420
443
|
# 'live_code_bench',
|
|
421
444
|
# 'humaneval',
|
|
422
445
|
# 'general_qa',
|
|
423
446
|
# 'alpaca_eval',
|
|
424
|
-
'arena_hard'
|
|
447
|
+
# 'arena_hard'
|
|
425
448
|
],
|
|
426
449
|
dataset_args={
|
|
427
450
|
'competition_math': {
|
|
@@ -447,7 +470,7 @@ class TestRun(unittest.TestCase):
|
|
|
447
470
|
]
|
|
448
471
|
},
|
|
449
472
|
},
|
|
450
|
-
eval_batch_size=
|
|
473
|
+
eval_batch_size=10,
|
|
451
474
|
limit=10,
|
|
452
475
|
judge_strategy=JudgeStrategy.AUTO,
|
|
453
476
|
judge_worker_num=5,
|
|
@@ -468,7 +491,7 @@ class TestRun(unittest.TestCase):
|
|
|
468
491
|
},
|
|
469
492
|
timeout=60000,
|
|
470
493
|
stream=True,
|
|
471
|
-
|
|
494
|
+
use_cache='outputs/20250519_142551'
|
|
472
495
|
)
|
|
473
496
|
|
|
474
497
|
run_task(task_cfg=task_cfg)
|
tests/perf/test_perf.py
CHANGED
|
@@ -103,7 +103,7 @@ class TestPerf(unittest.TestCase):
|
|
|
103
103
|
from evalscope.perf.arguments import Arguments
|
|
104
104
|
task_cfg = Arguments(
|
|
105
105
|
parallel=20,
|
|
106
|
-
model='
|
|
106
|
+
model='Qwen3-1.7B',
|
|
107
107
|
url='http://127.0.0.1:8801/v1/completions',
|
|
108
108
|
api='openai',
|
|
109
109
|
dataset='random',
|
|
@@ -117,8 +117,33 @@ class TestPerf(unittest.TestCase):
|
|
|
117
117
|
seed=None,
|
|
118
118
|
extra_args={'ignore_eos': True}
|
|
119
119
|
)
|
|
120
|
-
run_perf_benchmark(task_cfg)
|
|
120
|
+
metrics_result, percentile_result = run_perf_benchmark(task_cfg)
|
|
121
|
+
print(metrics_result)
|
|
122
|
+
print(percentile_result)
|
|
121
123
|
|
|
124
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
125
|
+
def test_run_perf_multi_parallel(self):
|
|
126
|
+
from evalscope.perf.arguments import Arguments
|
|
127
|
+
task_cfg = Arguments(
|
|
128
|
+
parallel=[1, 2],
|
|
129
|
+
number=[2, 5],
|
|
130
|
+
model='qwen2.5-7b-instruct',
|
|
131
|
+
url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
|
|
132
|
+
api_key=env.get('DASHSCOPE_API_KEY'),
|
|
133
|
+
api='openai',
|
|
134
|
+
dataset='random',
|
|
135
|
+
min_tokens=100,
|
|
136
|
+
max_tokens=100,
|
|
137
|
+
prefix_length=0,
|
|
138
|
+
min_prompt_length=1024,
|
|
139
|
+
max_prompt_length=1024,
|
|
140
|
+
tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
|
|
141
|
+
seed=None,
|
|
142
|
+
extra_args={'ignore_eos': True}
|
|
143
|
+
)
|
|
144
|
+
metrics_result, percentile_result = run_perf_benchmark(task_cfg)
|
|
145
|
+
print(metrics_result)
|
|
146
|
+
print(percentile_result)
|
|
122
147
|
|
|
123
148
|
if __name__ == '__main__':
|
|
124
149
|
unittest.main(buffer=False)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|