evalscope 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (48)
  1. evalscope/arguments.py +10 -0
  2. evalscope/backend/rag_eval/utils/llm.py +1 -1
  3. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +3 -3
  4. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
  5. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
  6. evalscope/benchmarks/data_adapter.py +4 -2
  7. evalscope/benchmarks/drop/__init__.py +0 -0
  8. evalscope/benchmarks/drop/drop_adapter.py +133 -0
  9. evalscope/benchmarks/drop/utils.py +59 -0
  10. evalscope/benchmarks/general_qa/general_qa_adapter.py +8 -4
  11. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
  12. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  13. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +67 -0
  14. evalscope/benchmarks/tool_bench/utils.py +202 -0
  15. evalscope/benchmarks/utils.py +3 -2
  16. evalscope/benchmarks/winogrande/__init__.py +0 -0
  17. evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
  18. evalscope/collections/evaluator.py +76 -26
  19. evalscope/config.py +46 -15
  20. evalscope/evaluator/evaluator.py +48 -14
  21. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  22. evalscope/metrics/llm_judge.py +3 -3
  23. evalscope/metrics/rouge_metric.py +11 -13
  24. evalscope/models/adapters/chat_adapter.py +51 -34
  25. evalscope/models/adapters/server_adapter.py +15 -19
  26. evalscope/perf/arguments.py +14 -5
  27. evalscope/perf/benchmark.py +4 -9
  28. evalscope/perf/main.py +69 -17
  29. evalscope/perf/utils/benchmark_util.py +33 -15
  30. evalscope/perf/utils/db_util.py +32 -20
  31. evalscope/perf/utils/log_utils.py +1 -1
  32. evalscope/perf/utils/rich_display.py +186 -0
  33. evalscope/report/app.py +47 -34
  34. evalscope/report/utils.py +1 -1
  35. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  36. evalscope/utils/deprecation_utils.py +42 -0
  37. evalscope/version.py +2 -2
  38. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/METADATA +49 -25
  39. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/RECORD +48 -38
  40. tests/aigc/test_t2i.py +4 -4
  41. tests/cli/test_all.py +3 -0
  42. tests/cli/test_collection.py +2 -1
  43. tests/cli/test_run.py +37 -14
  44. tests/perf/test_perf.py +27 -2
  45. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/LICENSE +0 -0
  46. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/WHEEL +0 -0
  47. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/entry_points.txt +0 -0
  48. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/top_level.txt +0 -0
{evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/RECORD CHANGED
@@ -1,11 +1,11 @@
1
1
  evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
2
- evalscope/arguments.py,sha256=jywTxu_HWhgf0_OlnaOyRSzUHenr5Zio2vmcCgcfbxg,5453
3
- evalscope/config.py,sha256=O3kjjVFRGSrlLD5EI4t99Z-m6oFtQVmEudvE62x92wY,9648
2
+ evalscope/arguments.py,sha256=fZW-om5E2_JaFcEmkvahvundjedPLgIDde-zwDXinG0,5868
3
+ evalscope/config.py,sha256=19QaZ5VS8wknt4sLBxiZkR6pH-nm4Ph3Kl-1bZgcQcE,10799
4
4
  evalscope/constants.py,sha256=PHnsGndB4N5-jvmawPxMK5b9geE2Es5cUe8ZKYSuKgM,4016
5
5
  evalscope/run.py,sha256=_DKbxgQGwhweBnQrI7lQhu5eoz4LYPVeNanzD4lHuJA,6476
6
6
  evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
7
7
  evalscope/summarizer.py,sha256=61kU5ZoSh1dd8HMJPqP3ZvJwcY9szwWFCZdu2lfATJA,5920
8
- evalscope/version.py,sha256=X2BkdAHDhsMo9BTAegfd5uYheDVI8rh_UG5YqMwwXUE,119
8
+ evalscope/version.py,sha256=8STVV6Y877B3esrgvovInSk4IFNzxZ_ZEz9ND_6B2lQ,119
9
9
  evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
11
11
  evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -50,27 +50,27 @@ evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=6x-4O2pgsjZCVf
50
50
  evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
51
51
  evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
52
52
  evalscope/backend/rag_eval/utils/embedding.py,sha256=tFMepPAMO4Kkqeqh-XxXIDYRjGbCMlk7lwuUW7FNvCA,7977
53
- evalscope/backend/rag_eval/utils/llm.py,sha256=acaD5QHPJUstJGpW1sNJ-3ZPT5J_Z8beOWb61Rtz07U,2607
53
+ evalscope/backend/rag_eval/utils/llm.py,sha256=NHjm0SeQVsSIG8uISXZcQypku4QRc3KtteeO9ldv0FI,2611
54
54
  evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
55
55
  evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
56
56
  evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=sUYvQxCtPl6CrcwhQpY8lJjW5skqWc-fvHUSnXd_MvQ,6054
57
57
  evalscope/benchmarks/__init__.py,sha256=5AXNhhmbaBFEe3u7y5TtIrviYzFI-hC8oKqxFILs1pE,937
58
58
  evalscope/benchmarks/benchmark.py,sha256=a_7Ctz36McuTyBSTYi56jis9pvOdWhg7JVSPFrbxqR4,2535
59
- evalscope/benchmarks/data_adapter.py,sha256=mWdxtHbordS577NqZUQZmIjlewjGDlStqc-iDvqpAyU,18061
60
- evalscope/benchmarks/utils.py,sha256=yXQyszzrILNiBuUrbB1BtgotQSaNA8w6X935AL1dNAw,1074
59
+ evalscope/benchmarks/data_adapter.py,sha256=lcBoXhI1Byn0HcwbVxmIeUFxZlz_wiqte6RDPOR8sbM,18184
60
+ evalscope/benchmarks/utils.py,sha256=jB9w3mN1eOur6j2kpQB_XZJ912fhzC0GaSeHOoylK7M,1087
61
61
  evalscope/benchmarks/aigc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
62
62
  evalscope/benchmarks/aigc/t2i/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
63
63
  evalscope/benchmarks/aigc/t2i/base.py,sha256=4GFAvceT1Gpt5teDLRCZi62RwvPazuhG3zwft3gN3X4,2102
64
64
  evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py,sha256=WV9w3z8TxWNzVzn9A_g0xqeHh76ydnHL5xLwyg63VmU,2992
65
65
  evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py,sha256=baDGFRpVcSKpc1CdzNAMBtjeCZDUpyEc5l1KyrPNoEU,1892
66
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py,sha256=But2hcQU3X3v58poF8Qg2agrxTAP6gnjZYJs8Tr0g_4,2047
66
+ evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py,sha256=t9h5qlo4KrHOgXIhHo3z6fEAi0HfUqDZvaItQdS7dZ4,2097
67
67
  evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py,sha256=U0RKN3apyD3YyZfIvqgO8TNuDO-zctlftHsSfBRyQxU,1825
68
68
  evalscope/benchmarks/aigc/t2i/tifa_adapter.py,sha256=vOOiOe26H2dk9VN2WbB_Oi3lzavMIaYDBq6sqeSIiAU,1093
69
69
  evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
70
70
  evalscope/benchmarks/aime/aime24_adapter.py,sha256=GrIxCHpUwgUy8tXGTB7iQOt8k7wG8MJB0CWbwBmIy-8,1703
71
71
  evalscope/benchmarks/aime/aime25_adapter.py,sha256=yxo5roCb8ryX9ROUU2FdZ-WBTUPZ14MrBzEL0zPOh-U,1718
72
72
  evalscope/benchmarks/alpaca_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
73
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=em1YM2PxnJ8Of7Li3eqrw8PtwfeXSinfVIr-CIKVb60,4026
73
+ evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=oUHpWrt5Gx0jF80RBd7zTh_1AWI66YvDd6U1vOMoqj0,3828
74
74
  evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
75
75
  evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
76
76
  evalscope/benchmarks/arc/arc_adapter.py,sha256=0h-eT4BBmUJQrakKMPUNE1nSRwK6LHB-cflWpWzY978,6364
@@ -110,7 +110,7 @@ evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTP
110
110
  evalscope/benchmarks/ceval/ceval_adapter.py,sha256=1ITBXI0f01Dt1p7sb2RGswIeg9685Bkk2S2xmA1vat8,11295
111
111
  evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
112
112
  evalscope/benchmarks/chinese_simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
113
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=zY8dfvrTeCHAQ3d7AM02CexZw5CVKH51ZOhtT7Q1Gko,8031
113
+ evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=Q6ncuLrCUrrhhljIfMsgWnyhHfcWWwh8iA6NZvz3W28,8079
114
114
  evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
115
115
  evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
116
116
  evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=r9zael_Y2Jso0ashevYpF8e5SHOBh8iMcPIJU5WT3pQ,10367
@@ -120,10 +120,13 @@ evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc
120
120
  evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=wgejW-_QswtT8_3JKAQ_H6svH8IotDJDBEH7X4nP4bY,6760
121
121
  evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
122
122
  evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=QgLgIrjD3q53T-lu1UWTV6T4h1cKGoCQDh0O4QxFezw,2569
123
+ evalscope/benchmarks/drop/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
124
+ evalscope/benchmarks/drop/drop_adapter.py,sha256=V-Vx6g2_1kcDUDWOKVX1vPSLt5iHn8NQkpWbsIwPaa4,8325
125
+ evalscope/benchmarks/drop/utils.py,sha256=Z9PHrNnRfGqFHCLONg5SWKARp1eTJlHFc_bU46t_YrM,1344
123
126
  evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
124
127
  evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=fqbt61owPP7t2H4B2zbYVZTs0VBGuXNvWGvkukwhRYc,5039
125
128
  evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
126
- evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=8d5znAcQmFSmvyKV-JuMQzbY5k6xDNQQdrWZ7zgPTK4,4603
129
+ evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=40mZovspVf-OXcuEu3ei6G_HZlYA8whAHSESHPPONxA,4750
127
130
  evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
128
131
  evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
129
132
  evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=UB287DtnbkSQXZsbReFJqmQRwbo672DTCeXXilR_-Vc,4790
@@ -174,12 +177,15 @@ evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO
174
177
  evalscope/benchmarks/race/race_adapter.py,sha256=RD0B-i5dzeNKuhqnWbremgf4tk9jmOO4_eLAiITB1F0,6381
175
178
  evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
176
179
  evalscope/benchmarks/simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
177
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=to4kSKc29BmtG4q9R2PeM-sdHiL8toSyoVi1D9WMRKk,8949
180
+ evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=TD7hkMLGZ4GK7wD7cwqJ3jCcTAaixOakUy3o5DaPYHI,8997
178
181
  evalscope/benchmarks/super_gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
179
182
  evalscope/benchmarks/super_gpqa/five_shot_prompt.txt,sha256=vD3RMeQustxY_oWA8IobntjywT8ZUO7Jaub--rElDT4,4718
180
183
  evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=BqNLL8BYnK6tRuIdV6ijL4Uym2SejH_h1BV06XNjSE4,9331
181
184
  evalscope/benchmarks/super_gpqa/utils.py,sha256=ftYPP9ODvLBlQSd9ltACx9iRIvjB8u1bg4AtgcJ4JAI,3360
182
185
  evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt,sha256=y7hR9SmoR_YqoEWtT8N9JpZOpeJIlg0cDGDgYw6R6hM,237
186
+ evalscope/benchmarks/tool_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
187
+ evalscope/benchmarks/tool_bench/tool_bench_adapter.py,sha256=l2dBcJ4Z3m-8QFtfyFH4IqMtvkY3Rfk021P9Ff_lXWQ,2270
188
+ evalscope/benchmarks/tool_bench/utils.py,sha256=vIPsL8FmMF2JZRHCZeLS_dDeATKNRvZDbq6T-Znlk8Q,7025
183
189
  evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
184
190
  evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
185
191
  evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
@@ -187,6 +193,8 @@ evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=7tMc8vVZdBnks5jWrBSrb
187
193
  evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
188
194
  evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
189
195
  evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=ueUU860kg5_xf_MtUCa6ck-fGHX3ttw8Xh3mWSJyOZA,12617
196
+ evalscope/benchmarks/winogrande/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
197
+ evalscope/benchmarks/winogrande/winogrande_adapter.py,sha256=UdANz3YmCtV2YfGuEihTe3vpUTlIxeXBhIqGkKbTFdU,1956
190
198
  evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
191
199
  evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
192
200
  evalscope/cli/cli.py,sha256=w_dtXljur9s5lmTn6LbbFL_viTPQB1WAEzhYcId09Og,729
@@ -195,22 +203,22 @@ evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,7
195
203
  evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
196
204
  evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
197
205
  evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
198
- evalscope/collections/evaluator.py,sha256=Ll-qLet04aEp1WxoCKAuvZVWEZuy1lS_D-vZIN3zSQQ,13425
206
+ evalscope/collections/evaluator.py,sha256=3sz_bL0HMFkxq3C-4P6rNGrnQolifVISI5sEpT3Bt90,15754
199
207
  evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
200
208
  evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
201
209
  evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
202
- evalscope/evaluator/evaluator.py,sha256=M1JrsoZZ5OvcZfzgLrNSMtbbz5gvvCd0GwJArJQV0lk,19797
210
+ evalscope/evaluator/evaluator.py,sha256=QzTFXiv_WdPpWTB3PgBNIz9KS_Rxu-fWDvoUpML23aA,21651
203
211
  evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
204
212
  evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
205
213
  evalscope/evaluator/reviewer/auto_reviewer.py,sha256=5WRYuXFTDgVmolrOdiTysk-mXrpw6Qg87-iuY-VD1W4,16618
206
214
  evalscope/metrics/__init__.py,sha256=y1sdj5FBKYW1q5kLC6QREzoITHwstJRUdji6p0X5aAE,1363
207
- evalscope/metrics/llm_judge.py,sha256=MjyTC-xiSThk8Rd4IdUbsCXeeikoOORv6wt8H7SW8s4,4008
215
+ evalscope/metrics/llm_judge.py,sha256=qYHsoBz-zXjL57Czl9CaPcyJT5SZr05giv5Q9SFK3cY,4000
208
216
  evalscope/metrics/math_parser.py,sha256=uTDudn305G3b8-GboWTrDE6OfrEwAW-areHnoGXZ6Is,17302
209
217
  evalscope/metrics/metrics.py,sha256=_YI7RhxlFu_JOgeE3LF9UKu6mJruvyu4FgqVf78Bjb8,13813
210
218
  evalscope/metrics/named_metrics.py,sha256=PrzU_1mGTeRFxVJFT1aXxIOiS7MnNoWyZsb8uCRVDeE,2278
211
- evalscope/metrics/rouge_metric.py,sha256=zhIUqenSuxnORR9tamLQBGjFwP91Zei2UiLtcOyseVM,4639
219
+ evalscope/metrics/rouge_metric.py,sha256=bqvSotuDdC0MEKmt8v6y6tBTBx0S3Ma-tfF-cMCckA4,4645
212
220
  evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
213
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=Kq6AObenmLVQ5tN3NgN042a6mgRFQmRO21-ohd9mSa8,11972
221
+ evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=m7v8ZysO9zCuyThEoGTe5QNVt2GsKMgZpH6du1FQCvg,12110
214
222
  evalscope/metrics/t2v_metrics/__init__.py,sha256=GBxgKTPVy_qhW_F3M4Oi6QMWhdAi4PqGX5w3t6Tueho,1783
215
223
  evalscope/metrics/t2v_metrics/clipscore.py,sha256=IsrYKIlFb04-FfBq4MbSv4diS6706J15Y3G4qEFIwfU,455
216
224
  evalscope/metrics/t2v_metrics/constants.py,sha256=oY5l5fOFl8qylah9eeebZm0pgY1PYmHDa7JlUC8Qls0,451
@@ -318,19 +326,19 @@ evalscope/models/model.py,sha256=MxvJAUNkuT7IA3bchnmJDur_YCKj9ShOD2Uq40dBcGc,630
318
326
  evalscope/models/register.py,sha256=pNC69YUvw-lodYpOXmByHm26h4m0Lofgd_om-JhOBq4,1882
319
327
  evalscope/models/adapters/__init__.py,sha256=mduiDZ6LgmkefNf4CtObZk6heOB93HxxgqTuYvrqWoo,590
320
328
  evalscope/models/adapters/base_adapter.py,sha256=f2FY8DLERudkfb4_anxNVFE_D19xCJj9BObiHWspewI,3268
321
- evalscope/models/adapters/chat_adapter.py,sha256=HD1jAKlAv5KRjzB0s21E4rTEIhryZhZHMpSctF9xrN8,7306
329
+ evalscope/models/adapters/chat_adapter.py,sha256=PAClyBL_nQ1I1kmjeeZ3sdC-y5ZmfFj8rjCigh_vr40,7885
322
330
  evalscope/models/adapters/choice_adapter.py,sha256=4fuz3MFEqK8ln4mMs3goMCdRPBwYmmgN70HTdr_sW_U,8005
323
331
  evalscope/models/adapters/custom_adapter.py,sha256=w8cD0b3xgcdhSZelcat67CGJnALOfz5IALzURnLjab8,2275
324
- evalscope/models/adapters/server_adapter.py,sha256=5kH1yDAjETogR7aOdnCEueYE1bREI40OdXdBiJpMdIM,6734
332
+ evalscope/models/adapters/server_adapter.py,sha256=d-0ne7ymWXmvKf_ypJ0093RNwplZJwhvU2xRwc8rt70,6581
325
333
  evalscope/models/adapters/t2i_adapter.py,sha256=xkMRyZ61yTiJfmULK-p9du4nNox41pkHiV2CTFBO3qM,2659
326
334
  evalscope/models/custom/__init__.py,sha256=MZylegALg1HerOYtp-qbzu4Wb6PW3JbrxwONHU-PAVs,131
327
335
  evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
328
336
  evalscope/models/custom/dummy_model.py,sha256=WRT_aCBZLXnC4yRCgggkuySkhM71C47O2Txx_YNc3UM,1933
329
337
  evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
330
- evalscope/perf/arguments.py,sha256=UZKlkbDE2N408dY8Ji-WB8sl1rcmamywzxLvNXpnY0w,10194
331
- evalscope/perf/benchmark.py,sha256=nv7gtCkeKnLKQQiKM4G0MYO2ambcuwsbx67OgEQG0nM,7917
338
+ evalscope/perf/arguments.py,sha256=5dTtaBR9BIobaKkX1Xj-mphHDG4uugnGaVOvWpLfN04,10714
339
+ evalscope/perf/benchmark.py,sha256=eGnxMLQXSYBGRJS4tS8geSJAirnuWo35M4orlRZzei8,7847
332
340
  evalscope/perf/http_client.py,sha256=-c3-N7bxKsj3d5DVsKSaYA3XAHJDzZgoqZBbhuDYIGk,7419
333
- evalscope/perf/main.py,sha256=w-yDbl0osaTAMgC-JNPpqIq2LQ7U4c-Ht7Amj8Nbjc8,1278
341
+ evalscope/perf/main.py,sha256=clHzkQNmv7wv-OWkuNGDQ-8YoLUCWxARIX-Eisinpms,3096
334
342
  evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
335
343
  evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
336
344
  evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
@@ -349,11 +357,12 @@ evalscope/perf/plugin/datasets/random_dataset.py,sha256=SIlsjAE_Stknfr6o1CBFvANB
349
357
  evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=J6q7AF_Re5eHLVejXEw9c1jlk1T1PPmist0yO9UFTPE,2432
350
358
  evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
351
359
  evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
352
- evalscope/perf/utils/benchmark_util.py,sha256=CftjnxYA7d1aeAL_iuyXcJPwCL5A8zWGZSkNtjrMyW8,6309
353
- evalscope/perf/utils/db_util.py,sha256=efz6qQtMIYAIpG0sAEjLwuzTHBUiuzAV1n7_DCGrN5o,9461
360
+ evalscope/perf/utils/benchmark_util.py,sha256=PcRTeKlEIslBw0zKVS2mFg6GgJ6J8m1f2-gAaEBeiHI,7236
361
+ evalscope/perf/utils/db_util.py,sha256=xqrXZapP_WwUdzkgFBTh3LDBWzr_UoU8v13rOjQ8TT4,9876
354
362
  evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
355
363
  evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627
356
- evalscope/perf/utils/log_utils.py,sha256=1jmB31W3ol9ukPAPbQ8xG3yoZ9oi3tjEyMK5M3ERmbw,1471
364
+ evalscope/perf/utils/log_utils.py,sha256=Xm5A8g8BaozaI_0TaPzr2aAxUBCCf-w7II-FcifrIYg,1503
365
+ evalscope/perf/utils/rich_display.py,sha256=SavP2L44UwN58ZUGR2W1wxM4h4F1iyPa90HhT-Ypkzs,8125
357
366
  evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
358
367
  evalscope/registry/config/cfg_arena.yaml,sha256=rub6ceaQxxB1mbSjdoFf0IaVgGfbOonV2nYRebv2OKo,3292
359
368
  evalscope/registry/config/cfg_arena_zhihu.yaml,sha256=tvvihBwvoTjoezwTSaZwoGOB44ysofpnin4pNyY9TfQ,2755
@@ -376,11 +385,11 @@ evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHM
376
385
  evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
377
386
  evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
378
387
  evalscope/report/__init__.py,sha256=iLNqx7CnHSHQmOBqWUK_vt2VIjnvGslJTqn--7B4y_s,316
379
- evalscope/report/app.py,sha256=8pcQi5oYAYa9hXoMoMUNfy9jSvSR9DDiXyLcyPd9AmA,28459
388
+ evalscope/report/app.py,sha256=FxNpiEmbpH_B7D5SYN42idGsyOgkgFrLzScOVrwL3SI,28998
380
389
  evalscope/report/app_arguments.py,sha256=1wHTLeFx1G94cKXYOeOVe_wTiOY2D929UctIRGOtRaQ,699
381
390
  evalscope/report/combinator.py,sha256=O3QirwtYhDhdaWVT4STJMCGZMwoX8BTeJ3HtS9iwnWQ,2567
382
391
  evalscope/report/generator.py,sha256=q9aHWNjQgvutAKtpjfWOpfu5zNFdnXilO9OqBqt_Phg,3612
383
- evalscope/report/utils.py,sha256=DRlbjbqHEmM8rGlA4pwtlHFhOZtyUzcqiS-mejfIDkU,4584
392
+ evalscope/report/utils.py,sha256=uu-rAzoN6ZIlv52IDWSZCcmNVY3DscNo2f9H9-gjZHY,4602
384
393
  evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
385
394
  evalscope/third_party/longbench_write/README.md,sha256=1yLKeSVIcihpoc4KXr8NpK86JbcLssCPx76aOKdPbYI,5431
386
395
  evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
@@ -413,13 +422,14 @@ evalscope/third_party/toolbench_static/config_default.yaml,sha256=-6n6Zyg9eHN2ee
413
422
  evalscope/third_party/toolbench_static/eval.py,sha256=do_-lVi_vEoljeLYvt3b_AYSMqpdKzgYnTek9WLSKe8,8236
414
423
  evalscope/third_party/toolbench_static/infer.py,sha256=rsADLhEd2IBcC6EI9aD7hSJmo6Oo5b22mnHWBCZLDPs,9010
415
424
  evalscope/third_party/toolbench_static/requirements.txt,sha256=OW91Z8hfzh7yQUYgP1Di_E6DgNgGoGP1UcvnqrdCR68,22
416
- evalscope/third_party/toolbench_static/toolbench_static.py,sha256=ABb9Gy09zMt30tY50AZGxSZ46k3NVEsvuDj6xlLOjeA,1966
425
+ evalscope/third_party/toolbench_static/toolbench_static.py,sha256=xE__eXvSwHmmSh1tXNvyBo6MCO4mDlYTbIYl9OGEfNI,2120
417
426
  evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
418
427
  evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=GITEbyiER10Zi-ZWpSqYCdAsiVtNeGK24hvR3kmYn2s,2689
419
428
  evalscope/utils/__init__.py,sha256=jLVoGryuqUh4Km9QWWQBzpqkcVNRK0MbwNaSgckqdiU,139
420
429
  evalscope/utils/arena_utils.py,sha256=Gf8VpH4C_oF2Abif_QeL0rAP6tvTzsc0gglpdNkUE48,7155
421
430
  evalscope/utils/chat_service.py,sha256=U2jtrkOa2asRp16Zam0zIi_38mCyWQqql_L6JSwii4I,8749
422
431
  evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
432
+ evalscope/utils/deprecation_utils.py,sha256=WyeiLWSi5ti6FkuMbhimcPPUB43paa1FZ5-JOAWNFZI,1344
423
433
  evalscope/utils/filters.py,sha256=x_NX40uWMmUsVrAGHCeeV2e63HZZFugWUgdUhk64ivM,1523
424
434
  evalscope/utils/import_utils.py,sha256=Oo8saX_mMw4U1RrA7_pn8FmV6P9laru4fEgecqqwpqk,2585
425
435
  evalscope/utils/io_utils.py,sha256=Tjdgen1FsAA4ArqiUzu734L0Px5NuiS0GKRRiGIzxSA,4192
@@ -429,13 +439,13 @@ evalscope/utils/utils.py,sha256=hP_ntROFsZ-zaNVpJtT2prNo8iX-UAKfRtdxbLtPJng,1110
429
439
  tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
430
440
  tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
431
441
  tests/aigc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
432
- tests/aigc/test_t2i.py,sha256=_M3WxY5ruBM4RD7rYHhgizcIhH-ny5XD9M16Ayl3UPk,2619
442
+ tests/aigc/test_t2i.py,sha256=BcdS3OMypWnraXF4Cq3DhDVRpZq0qo9_0Qpyg54B7FY,2627
433
443
  tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
434
- tests/cli/test_all.py,sha256=pwup--iNxckUEsR_aFjIAbEQo3UogSu5aIWf9ryLP2o,4022
435
- tests/cli/test_collection.py,sha256=y8FjoPziPRf5BdJK8DHjcXn26ETKz1OyqjnCpwjt-F4,4096
436
- tests/cli/test_run.py,sha256=4B-6sOyotK3omirZWWyg7-CcnUSeZjiaU3aXHr0hH_Y,16804
444
+ tests/cli/test_all.py,sha256=O3lXwOV7A0f0rmltofrjpphnshjNtaZC6NUPG-wsQjg,4082
445
+ tests/cli/test_collection.py,sha256=_11mSCWLaiCgheA3uguv6uI3CxqaHUKVwzS6T5BGmxs,4145
446
+ tests/cli/test_run.py,sha256=FTFiAb8Ge5raB1aa0Nzw8DPjFLyAlLfXHRQVIWjvvGE,17798
437
447
  tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
438
- tests/perf/test_perf.py,sha256=8K5tGlWwOpYWnJ0GaCpqSw9zPOiM8fEKJaDil2mpTSQ,3831
448
+ tests/perf/test_perf.py,sha256=VbXsqiqgQY3R3bVKizYQmP04UPluUS26MO6YhTzMs48,4848
439
449
  tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
440
450
  tests/rag/test_clip_benchmark.py,sha256=ZCBtgnF8Vuji6WQlb92-_RIvXlUX_Xt-cHZP4AN_DNI,2552
441
451
  tests/rag/test_mteb.py,sha256=YJw6X1jwX6SYNB-ryVb-OHJWu3vsE3Y4STATI75rdG0,5619
@@ -446,9 +456,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
446
456
  tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
447
457
  tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
448
458
  tests/vlm/test_vlmeval.py,sha256=UqRiBPMU3vRtLIG1Qu4ZVhyUQx-zGYQuLCgobwf-7a4,3176
449
- evalscope-0.15.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
450
- evalscope-0.15.0.dist-info/METADATA,sha256=MLn0s_L7s0oeQPWL1XuhihDAFJnzLdVTvdrep-9Bgag,34053
451
- evalscope-0.15.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
452
- evalscope-0.15.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
453
- evalscope-0.15.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
454
- evalscope-0.15.0.dist-info/RECORD,,
459
+ evalscope-0.16.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
460
+ evalscope-0.16.0.dist-info/METADATA,sha256=zX2L_cLxOjX-NNbiR40dmPOxUWyOH86zJycYjr4j5Po,35492
461
+ evalscope-0.16.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
462
+ evalscope-0.16.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
463
+ evalscope-0.16.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
464
+ evalscope-0.16.0.dist-info/RECORD,,
tests/aigc/test_t2i.py CHANGED
@@ -59,9 +59,9 @@ class TestRun(unittest.TestCase):
59
59
  },
60
60
  datasets=[
61
61
  'tifa160',
62
- 'genai_bench',
63
- 'evalmuse',
64
- 'hpdv2',
62
+ # 'genai_bench',
63
+ # 'evalmuse',
64
+ # 'hpdv2',
65
65
  ],
66
66
  dataset_args={
67
67
  'tifa160': {
@@ -81,7 +81,7 @@ class TestRun(unittest.TestCase):
81
81
  'num_inference_steps': 50,
82
82
  'guidance_scale': 7.5
83
83
  },
84
- use_cache='outputs/20250427_134122',
84
+ # use_cache='outputs/20250427_134122',
85
85
  )
86
86
 
87
87
  run_task(task_cfg=task_cfg)
tests/cli/test_all.py CHANGED
@@ -46,6 +46,9 @@ datasets=[
46
46
  'alpaca_eval',
47
47
  'arena_hard',
48
48
  'maritime_bench',
49
+ 'drop',
50
+ 'winogrande',
51
+ 'tool_bench',
49
52
  ]
50
53
 
51
54
  dataset_args={
tests/cli/test_collection.py CHANGED
@@ -78,7 +78,8 @@ class TestCollection(unittest.TestCase):
78
78
  'model_id': 'qwen2.5-7b-instruct',
79
79
  'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
80
80
  'api_key': os.getenv('DASHSCOPE_API_KEY'),
81
- }
81
+ },
82
+ use_cache='outputs/20250519_114427'
82
83
  )
83
84
  res = run_task(task_cfg=task_cfg)
84
85
  print(res)
tests/cli/test_run.py CHANGED
@@ -73,6 +73,12 @@ class TestRun(unittest.TestCase):
73
73
  logger.info(f'>>test_run_eval_with_args stdout: {run_res.stdout}')
74
74
  logger.error(f'>>test_run_eval_with_args stderr: {run_res.stderr}')
75
75
 
76
+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
77
+ def test_run_yaml_config(self):
78
+ from evalscope import run_task
79
+
80
+ run_task(task_cfg='examples/tasks/eval_native.yaml')
81
+
76
82
  @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
77
83
  def test_run_task(self):
78
84
  task_cfg = TaskConfig(
@@ -207,14 +213,18 @@ class TestRun(unittest.TestCase):
207
213
  from evalscope.config import TaskConfig
208
214
 
209
215
  task_cfg = TaskConfig(
210
- model='Qwen/Qwen2.5-0.5B-Instruct',
216
+ model='Qwen/Qwen3-1.7B',
211
217
  datasets=[
212
- # 'iquiz',
218
+ 'iquiz',
213
219
  # 'math_500',
214
220
  # 'aime24',
215
221
  # 'competition_math',
216
- 'mmlu',
222
+ # 'mmlu',
223
+ # 'simple_qa',
217
224
  ],
225
+ model_args={
226
+ 'device_map': 'auto',
227
+ },
218
228
  dataset_args={
219
229
  'competition_math': {
220
230
  'subset_list': ['Level 4', 'Level 5']
@@ -224,8 +234,16 @@ class TestRun(unittest.TestCase):
224
234
  'few_shot_num': 0
225
235
  },
226
236
  },
227
- limit=10,
228
- eval_batch_size=10,
237
+ limit=5,
238
+ eval_batch_size=5,
239
+ generation_config={
240
+ 'max_new_tokens': 1000, # 最大生成token数,建议设置为较大值避免输出截断
241
+ 'temperature': 0.7, # 采样温度 (qwen 报告推荐值)
242
+ 'top_p': 0.8, # top-p采样 (qwen 报告推荐值)
243
+ 'top_k': 20, # top-k采样 (qwen 报告推荐值)
244
+ 'chat_template_kwargs': {'enable_thinking': False} # 关闭思考模式
245
+ },
246
+ judge_strategy=JudgeStrategy.AUTO,
229
247
  )
230
248
 
231
249
  run_task(task_cfg=task_cfg)
@@ -269,7 +287,7 @@ class TestRun(unittest.TestCase):
269
287
  datasets=[
270
288
  # 'iquiz',
271
289
  # 'ifeval',
272
- 'mmlu',
290
+ # 'mmlu',
273
291
  # 'mmlu_pro',
274
292
  # 'musr',
275
293
  # 'process_bench',
@@ -287,10 +305,13 @@ class TestRun(unittest.TestCase):
287
305
  # 'ceval',
288
306
  # 'hellaswag',
289
307
  # 'general_mcq',
290
- # 'general_qa'
308
+ # 'general_qa',
291
309
  # 'super_gpqa',
292
310
  # 'mmlu_redux',
293
- # 'maritime_bench'
311
+ # 'maritime_bench',
312
+ # 'drop',
313
+ # 'winogrande',
314
+ 'tool_bench',
294
315
  ],
295
316
  dataset_args={
296
317
  'mmlu': {
@@ -356,14 +377,16 @@ class TestRun(unittest.TestCase):
356
377
  },
357
378
  },
358
379
  eval_batch_size=32,
359
- limit=15,
380
+ limit=10,
360
381
  debug=True,
361
382
  stream=False,
362
383
  generation_config={
363
384
  'temperature': 0,
364
385
  'n': 1,
365
386
  'max_tokens': 4096,
366
- }
387
+ },
388
+ # ignore_errors=True,
389
+ use_cache='outputs/20250519_142106'
367
390
  )
368
391
 
369
392
  run_task(task_cfg=task_cfg)
@@ -416,12 +439,12 @@ class TestRun(unittest.TestCase):
416
439
  # 'gsm8k'
417
440
  # 'truthful_qa',
418
441
  # 'simple_qa',
419
- # 'chinese_simpleqa',
442
+ 'chinese_simpleqa',
420
443
  # 'live_code_bench',
421
444
  # 'humaneval',
422
445
  # 'general_qa',
423
446
  # 'alpaca_eval',
424
- 'arena_hard'
447
+ # 'arena_hard'
425
448
  ],
426
449
  dataset_args={
427
450
  'competition_math': {
@@ -447,7 +470,7 @@ class TestRun(unittest.TestCase):
447
470
  ]
448
471
  },
449
472
  },
450
- eval_batch_size=5,
473
+ eval_batch_size=10,
451
474
  limit=10,
452
475
  judge_strategy=JudgeStrategy.AUTO,
453
476
  judge_worker_num=5,
@@ -468,7 +491,7 @@ class TestRun(unittest.TestCase):
468
491
  },
469
492
  timeout=60000,
470
493
  stream=True,
471
- # use_cache='outputs/20250320_143658'
494
+ use_cache='outputs/20250519_142551'
472
495
  )
473
496
 
474
497
  run_task(task_cfg=task_cfg)
tests/perf/test_perf.py CHANGED
@@ -103,7 +103,7 @@ class TestPerf(unittest.TestCase):
103
103
  from evalscope.perf.arguments import Arguments
104
104
  task_cfg = Arguments(
105
105
  parallel=20,
106
- model='Qwen2.5-0.5B-Instruct',
106
+ model='Qwen3-1.7B',
107
107
  url='http://127.0.0.1:8801/v1/completions',
108
108
  api='openai',
109
109
  dataset='random',
@@ -117,8 +117,33 @@ class TestPerf(unittest.TestCase):
117
117
  seed=None,
118
118
  extra_args={'ignore_eos': True}
119
119
  )
120
- run_perf_benchmark(task_cfg)
120
+ metrics_result, percentile_result = run_perf_benchmark(task_cfg)
121
+ print(metrics_result)
122
+ print(percentile_result)
121
123
 
124
+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
125
+ def test_run_perf_multi_parallel(self):
126
+ from evalscope.perf.arguments import Arguments
127
+ task_cfg = Arguments(
128
+ parallel=[1, 2],
129
+ number=[2, 5],
130
+ model='qwen2.5-7b-instruct',
131
+ url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
132
+ api_key=env.get('DASHSCOPE_API_KEY'),
133
+ api='openai',
134
+ dataset='random',
135
+ min_tokens=100,
136
+ max_tokens=100,
137
+ prefix_length=0,
138
+ min_prompt_length=1024,
139
+ max_prompt_length=1024,
140
+ tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
141
+ seed=None,
142
+ extra_args={'ignore_eos': True}
143
+ )
144
+ metrics_result, percentile_result = run_perf_benchmark(task_cfg)
145
+ print(metrics_result)
146
+ print(percentile_result)
122
147
 
123
148
  if __name__ == '__main__':
124
149
  unittest.main(buffer=False)