evalscope 0.11.0__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58)
  1. evalscope/arguments.py +2 -0
  2. evalscope/benchmarks/aime/aime25_adapter.py +49 -0
  3. evalscope/benchmarks/bbh/bbh_adapter.py +0 -5
  4. evalscope/benchmarks/benchmark.py +3 -1
  5. evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -17
  6. evalscope/benchmarks/data_adapter.py +71 -18
  7. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +6 -10
  8. evalscope/benchmarks/general_qa/general_qa_adapter.py +4 -5
  9. evalscope/benchmarks/gpqa/gpqa_adapter.py +1 -1
  10. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +1 -1
  11. evalscope/benchmarks/ifeval/ifeval_adapter.py +1 -1
  12. evalscope/benchmarks/math_500/math_500_adapter.py +10 -1
  13. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +16 -32
  14. evalscope/benchmarks/musr/__init__.py +0 -0
  15. evalscope/benchmarks/musr/musr_adapter.py +68 -0
  16. evalscope/benchmarks/process_bench/__init__.py +0 -0
  17. evalscope/benchmarks/process_bench/critique_template.txt +13 -0
  18. evalscope/benchmarks/process_bench/process_bench_adapter.py +96 -0
  19. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -1
  20. evalscope/cli/start_app.py +4 -1
  21. evalscope/cli/start_eval.py +4 -3
  22. evalscope/cli/start_perf.py +4 -2
  23. evalscope/collections/evaluator.py +6 -0
  24. evalscope/config.py +3 -1
  25. evalscope/evaluator/evaluator.py +3 -1
  26. evalscope/metrics/__init__.py +2 -1
  27. evalscope/metrics/metrics.py +23 -2
  28. evalscope/models/base_adapter.py +7 -1
  29. evalscope/models/chat_adapter.py +1 -1
  30. evalscope/models/local_model.py +3 -2
  31. evalscope/models/server_adapter.py +79 -28
  32. evalscope/perf/__init__.py +0 -1
  33. evalscope/perf/arguments.py +5 -1
  34. evalscope/perf/http_client.py +2 -2
  35. evalscope/perf/plugin/api/openai_api.py +11 -1
  36. evalscope/perf/utils/benchmark_util.py +6 -2
  37. evalscope/report/app.py +12 -8
  38. evalscope/run.py +1 -1
  39. evalscope/third_party/thinkbench/__init__.py +3 -0
  40. evalscope/third_party/thinkbench/eval.py +264 -0
  41. evalscope/third_party/thinkbench/infer.py +100 -0
  42. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  43. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  44. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  45. evalscope/third_party/thinkbench/tools/llm.py +47 -0
  46. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  47. evalscope/utils/model_utils.py +17 -1
  48. evalscope/utils/utils.py +45 -45
  49. evalscope/version.py +2 -2
  50. {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/METADATA +9 -4
  51. {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/RECORD +58 -44
  52. tests/cli/test_run.py +27 -15
  53. /evalscope/benchmarks/{aime24 → aime}/__init__.py +0 -0
  54. /evalscope/benchmarks/{aime24 → aime}/aime24_adapter.py +0 -0
  55. {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/LICENSE +0 -0
  56. {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/WHEEL +0 -0
  57. {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/entry_points.txt +0 -0
  58. {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/top_level.txt +0 -0
{evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.11.0
+ Version: 0.12.0
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -21,6 +21,7 @@ Requires-Dist: accelerate
  Requires-Dist: cachetools
  Requires-Dist: datasets<=3.2.0,>=3.0.0
  Requires-Dist: editdistance
+ Requires-Dist: immutabledict
  Requires-Dist: jieba
  Requires-Dist: jsonlines
  Requires-Dist: langdetect
@@ -58,6 +59,7 @@ Requires-Dist: accelerate; extra == "all"
  Requires-Dist: cachetools; extra == "all"
  Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
  Requires-Dist: editdistance; extra == "all"
+ Requires-Dist: immutabledict; extra == "all"
  Requires-Dist: jieba; extra == "all"
  Requires-Dist: jsonlines; extra == "all"
  Requires-Dist: langdetect; extra == "all"
@@ -101,10 +103,10 @@ Requires-Dist: sse-starlette; extra == "all"
  Requires-Dist: transformers; extra == "all"
  Requires-Dist: unicorn; extra == "all"
  Requires-Dist: gradio==5.4.0; extra == "all"
- Requires-Dist: plotly>=5.23.0; extra == "all"
+ Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
  Provides-Extra: app
  Requires-Dist: gradio==5.4.0; extra == "app"
- Requires-Dist: plotly>=5.23.0; extra == "app"
+ Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
  Provides-Extra: inner
  Requires-Dist: absl-py; extra == "inner"
  Requires-Dist: accelerate; extra == "inner"
@@ -223,6 +225,9 @@ Please scan the QR code below to join our community groups:
 
 
  ## 🎉 News
+ - 🔥 **[2025.02.27]** Added support for evaluating the reasoning efficiency of models. Refer to [📖 Best Practices for Evaluating Thinking Efficiency](https://evalscope.readthedocs.io/zh-cn/latest/best_practice/think_eval.html). This implementation is inspired by the works [Overthinking](https://doi.org/10.48550/arXiv.2412.21187) and [Underthinking](https://doi.org/10.48550/arXiv.2501.18585).
+ - 🔥 **[2025.02.25]** Added support for two model inference-related evaluation benchmarks: [MuSR](https://modelscope.cn/datasets/AI-ModelScope/MuSR) and [ProcessBench](https://www.modelscope.cn/datasets/Qwen/ProcessBench/summary). To use them, simply specify `musr` and `process_bench` respectively in the datasets parameter.
+ - 🔥 **[2025.02.18]** Supports the AIME25 dataset, which contains 15 questions (Grok3 scored 93 on this dataset).
  - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/zh-cn/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
  - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
  - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
@@ -460,7 +465,7 @@ Then, you can use the following command to evaluate the model API service:
  ```shell
  evalscope eval \
  --model qwen2.5 \
- --api-url http://127.0.0.1:8801/v1/chat/completions \
+ --api-url http://127.0.0.1:8801/v1 \
  --api-key EMPTY \
  --eval-type service \
  --datasets gsm8k \
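
The README hunks above advertise the new MuSR, ProcessBench, and AIME25 benchmarks and switch the service example to the OpenAI-compatible base URL (`/v1` rather than `/v1/chat/completions`). Below is a minimal sketch of the same change driven from Python; the import paths are assumptions inferred from the package layout in the RECORD diff further down (evalscope/config.py, evalscope/constants.py, evalscope/run.py), and the endpoint, subsets, and limits mirror the updated tests/cli/test_run.py at the end of this page rather than any documented example.

```python
# Sketch only: evaluating a served model on the benchmarks added in 0.12.0.
# Import paths and the local endpoint are assumptions based on this diff.
from evalscope.config import TaskConfig
from evalscope.constants import EvalType
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='qwen2.5',
    # 0.12.0 points at the OpenAI-compatible base URL, not the full /chat/completions path
    api_url='http://127.0.0.1:8801/v1',
    api_key='EMPTY',
    eval_type=EvalType.SERVICE,
    datasets=['musr', 'process_bench'],  # new in 0.12.0; 'aime25' is also registered
    dataset_args={
        'process_bench': {'subset_list': ['gsm8k']},
        'musr': {'subset_list': ['murder_mysteries']},
    },
    eval_batch_size=5,
    limit=5,
)

run_task(task_cfg=task_cfg)
```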
{evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/RECORD CHANGED
@@ -1,11 +1,11 @@
  evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
- evalscope/arguments.py,sha256=r8gOMX6i8dWMl_WXLsBdHla7cuauBAyv9apky9VxLsE,4598
- evalscope/config.py,sha256=D7C_K0f0xsfzFUSNSJJUTz3n9tmA6zLDbf8pZ_9ltpw,8600
+ evalscope/arguments.py,sha256=lYxhmZVs-dUz8q9cpwjoe-HuwglkkgxiSaluuXlAmAc,4814
+ evalscope/config.py,sha256=BZv7maQTbxXkb2WzdeGGQr0U01_TXy-Q7PujOiPJ4D8,8703
  evalscope/constants.py,sha256=bkcDVbB4Pr1Qxz83qefcWjEetVGiHTcx3m84WX14ASI,3330
- evalscope/run.py,sha256=qfMqVWlUiXEiIJ665p3-IYWknhIeNZkCJe3Yn07Y74U,5692
+ evalscope/run.py,sha256=zRdBJEYdQ6JzH94eA7gfkzFAvsn3UFwdrvX_snaqGNU,5702
  evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
  evalscope/summarizer.py,sha256=Wnt8Y61PasOkigo8zMpi1qdFzfETjfnDtCDDQ6VwgDw,5867
- evalscope/version.py,sha256=h6YAZAgeAreWmKtpfr4D6BEvnWZxb1bka9hrpYOO0l8,119
+ evalscope/version.py,sha256=RDE_Gbn1y54qtXxjxbZOTLDFSkq__2Zy3rAOwyVrvPs,119
  evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
  evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -56,15 +56,16 @@ evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4ty
  evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
  evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
  evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
- evalscope/benchmarks/benchmark.py,sha256=IY2xYmNR58aYnZK7rnUDONWiLQopo_ZifGS2SfN2L-Q,2422
- evalscope/benchmarks/data_adapter.py,sha256=xCBvJe4ubgpP1J8ElcWAJwF6B5CSrBEv_uMwQzlUaLY,12540
- evalscope/benchmarks/aime24/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- evalscope/benchmarks/aime24/aime24_adapter.py,sha256=FYH8NsT1nis3VoBMzRM_ueOsGNXjOKZCa6J_wpUM3RQ,1772
+ evalscope/benchmarks/benchmark.py,sha256=Kaes5Bg9_bvFO99-JztNlv_TPg4jH9vMYvnMcb1C_G8,2507
+ evalscope/benchmarks/data_adapter.py,sha256=e4mtvzlC8ehQ0N4C5PAGJFv5N9Y42WT-OklwaV-Ex1Y,15239
+ evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ evalscope/benchmarks/aime/aime24_adapter.py,sha256=FYH8NsT1nis3VoBMzRM_ueOsGNXjOKZCa6J_wpUM3RQ,1772
+ evalscope/benchmarks/aime/aime25_adapter.py,sha256=bws4dajr5xuMDvuTluDb80oBYUTUlu_geKvmnNO3_OQ,1766
  evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
  evalscope/benchmarks/arc/arc_adapter.py,sha256=vfwAy01LA141qn1lsSyZmEIGWbbhOCRMOGoSM-K2z6M,6490
  evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
- evalscope/benchmarks/bbh/bbh_adapter.py,sha256=37wY3r1qW5qdjyKF-8n7UIM0IVcpaQugMb5Rkjbppxg,8524
+ evalscope/benchmarks/bbh/bbh_adapter.py,sha256=cep-Ws9Tozju6JWls1-oz3lKYqTL8q8Cee_d-d3cLIo,8407
  evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
  evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt,sha256=sfo-2iOeVzB0OGgd7NSQFELTGDTsr2DQ3u-g0ivI-sM,3653
  evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt,sha256=UJBsc3Mwz8TZngdWH_NFlhhNbLhNHK6FvW9FHcS8H5g,1167
@@ -101,19 +102,19 @@ evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=1RmhI0SNxHK-Fz-iTIR76zeBRDLlm
  evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
  evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
- evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=i0E4TNajMVcWT8lc5haIjKvdmHuI5qzgpssIm5Fw7bs,7413
+ evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=_vGkfgP5ZnQh3AlbJqycQOL_gQHayazMYFzHVo2e5O8,6902
  evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=o3Q6ke-RLx4qUbF5FgASZogv3-kCJ6qpK43F_LARU3Y,2496
  evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=vDHgsWpsIZQWNadl3mI8M3rDKkvPM2N2KAkW-8aeOHY,5130
+ evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=M-PocYW4pkGtKOKvFZW-bIoztcGvmHn5Gf5o7F71xCg,5248
  evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
- evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=fu14ZzGYyg2MEdJbxZGBoIbais6xA9Um2BEAJTvBZZM,3823
+ evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=_t2ZNsQzY5AuOOBXkEVNGIB3pZgLKQmw7-5gEqR_Z_k,3848
  evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
- evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=tiy8Cn1ZmNKjVg8lqNAxWBbsKp8h0uiDNpWuHfcID0A,4689
+ evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=1zI6GWfvPiKaZg39N7pSFw2R-GpbrjEo-11K_waq6Dg,4686
  evalscope/benchmarks/gsm8k/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTvt5hpXg,4233
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=4qtMX_SfqkXRMgGLOA6tNGMK9EkITWbjLlJT9gWbT20,10664
+ evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=X7fu2mx911Al-7a6j-mJQ3vqTb0cN0u7FoJTrNf6AN4,10661
  evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
  evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=qArX2umdrYJZkDA9i3XGBGljCton99v5Yss9be9iZYw,6269
@@ -121,7 +122,7 @@ evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0
  evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
  evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=onacZB_6SF9239Ly-U70__WYsinS9iWpnf3oiYMNxKc,5164
  evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=3HsAdNj5JJGCFA17sPXi-59yv-pfcB0UeXKdY_mQcwU,2015
+ evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=zmN69NDhBR3NJak1cB0z3DqPMuoAvqADWMapQPnvGLs,2025
  evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
  evalscope/benchmarks/ifeval/instructions_registry.py,sha256=tVUmhuSwnOidLtI8onOAw_gpJ6bi8FL07GiX19hSuo8,7288
  evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2OF-oH_NcSZtzyZDURW4,25780
@@ -129,13 +130,18 @@ evalscope/benchmarks/ifeval/utils.py,sha256=TKrM1m2qDCUauahogItDdICf4mDk0OjasSxg
  evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=nv4mzKOPp1YPcr6e7daZuZyQ3jRNNG6PUzi38REuwSk,2356
  evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- evalscope/benchmarks/math_500/math_500_adapter.py,sha256=mBzsllop5sTHw-uK04FjhEWDiEDjDaNUFDUBIVN7Xgg,1742
+ evalscope/benchmarks/math_500/math_500_adapter.py,sha256=OO3Jx1WuyEMfd4R5znG9_O5ln_SbVVGB5u1bTjiuWaU,2104
  evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
  evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=pmT1y9dbWJcZK3U6hkXa3-lBDABx7DhQ7oHc3O-Nkg0,11769
  evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
  evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=Fdrj26MfYmPzio2tI23WTcofrwD69_m41mkVpvlxzVU,4815
+ evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=73TLdWlEAulZNA0ZMLDQnaXs435vG-gD89yjURjsjpo,4111
+ evalscope/benchmarks/musr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ evalscope/benchmarks/musr/musr_adapter.py,sha256=D-CZMTr9Ld_tJxZdCDPZQxDX86BgJqKWCyy5-tlcONY,2343
+ evalscope/benchmarks/process_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ evalscope/benchmarks/process_bench/critique_template.txt,sha256=tycx8n42QEC0uGcwbIvHfZvfTnchlRxGz8Tp1R2_e_Y,489
+ evalscope/benchmarks/process_bench/process_bench_adapter.py,sha256=YavcEDpnURVV0gCWTXDKq81CyEOgn8ASaVQu2h765to,3389
  evalscope/benchmarks/race/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO61nU,3608
  evalscope/benchmarks/race/race_adapter.py,sha256=dC9I-3T9UFh2OVpmWKRmSszPOlFZAZ40xOPa4zN3daI,6661
@@ -146,27 +152,27 @@ evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI
  evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=GVuJT-Xz4ugVtcUSTRxcBgViHVowcqJf3yVsotcZoZI,5062
  evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=6rT1zuQh0nLuYymcchO-cMP98EY0vWizbfTfnUERWgo,12905
+ evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=MYMLpIEv3oqRiOgAydqM0ZlzpvbzpCymOKUbca98yxo,12915
  evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
  evalscope/cli/cli.py,sha256=w_dtXljur9s5lmTn6LbbFL_viTPQB1WAEzhYcId09Og,729
- evalscope/cli/start_app.py,sha256=_NTmCd15tZOROAnPacGWirMS4OXHrL3n2eZj1kokpks,758
- evalscope/cli/start_eval.py,sha256=2lyD2WSQ0DnP6T31VvTimQ-6POnwxeEP9GLPFnT7Tfo,767
- evalscope/cli/start_perf.py,sha256=lEHJBSpzNsO4KGlWfQc-EfZGXq1M_FpOwtRxRdb4fso,813
+ evalscope/cli/start_app.py,sha256=WTbba_Iitz1jkQ5n6KHRH-i3U7qJIM7iCi4a9roWjaA,808
+ evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,775
+ evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
  evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
  evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
- evalscope/collections/evaluator.py,sha256=FJx3KGdLi0-TIqWC_067HEmA4P298BKdwHIrbcai46M,12065
+ evalscope/collections/evaluator.py,sha256=1bz2jEgOlv7qHeCgkFCtd1MPWhMa6XnZfP4XJBhTnUc,12321
  evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
  evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
  evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
- evalscope/evaluator/evaluator.py,sha256=E0NiP5O56WbF8eiUmw9IY2ouotRog9H-2SRyTzZld0I,17569
+ evalscope/evaluator/evaluator.py,sha256=VIiw1eI46UOsFWNd7schD4ah_Q5ll0crl2sRmGIRmig,17649
  evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
  evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/evaluator/reviewer/auto_reviewer.py,sha256=PDz1Nt_qq0oGZBBwek2-M8VBUNLkOkmYObzR8gO7nEc,16624
- evalscope/metrics/__init__.py,sha256=yzuZjXufrPqVhzNTNaJLJwhs7-Sgb-iNG0I3BdOX7Tg,291
+ evalscope/metrics/__init__.py,sha256=SWvqzUzdryW5URz6u4fPkP9XSyA09nQ8zBeE8BbchSg,349
  evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
  evalscope/metrics/math_parser.py,sha256=uTDudn305G3b8-GboWTrDE6OfrEwAW-areHnoGXZ6Is,17302
- evalscope/metrics/metrics.py,sha256=r4FHyEvvFhMu0vAHBw-ByFefObDBC3DQdr53klSk6Wk,13325
+ evalscope/metrics/metrics.py,sha256=_YI7RhxlFu_JOgeE3LF9UKu6mJruvyu4FgqVf78Bjb8,13813
  evalscope/metrics/named_metrics.py,sha256=SeBXmgWyK4y4tKiGKro3k-CZU1OShuKe6qxwpT3tizY,1313
  evalscope/metrics/rouge_metric.py,sha256=zhIUqenSuxnORR9tamLQBGjFwP91Zei2UiLtcOyseVM,4639
  evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
@@ -174,20 +180,20 @@ evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=Kq6AObenmLVQ5tN3NgN
  evalscope/metrics/resources/gpt2-zhcn3-v4.bpe,sha256=J_K-oSt9usFEw87Av6j7ETn3J48EsmFuY5_iVvY6xjc,524464
  evalscope/metrics/resources/gpt2-zhcn3-v4.json,sha256=WkM4J_FDPPNQwYi0kj5sM5SVjk2_6bci7tqf8dV9p_U,1289341
  evalscope/models/__init__.py,sha256=pafIEbJq_2DrYjQbgI0SNVxywNYOxvqwk7Dr1P7KEwk,923
- evalscope/models/base_adapter.py,sha256=fT3i8c9jRmz_VBcUYMMmXrlCM6JWcixPdgak5yT6Wkw,2177
- evalscope/models/chat_adapter.py,sha256=nOrNDuvuNKkTcW9zNcR_EIqbzkqK5PFws-5YsSxBR9E,6120
+ evalscope/models/base_adapter.py,sha256=04VK4A5L0naOllBW9fw03GduvBUNgStliyFBTZKY0xU,2297
+ evalscope/models/chat_adapter.py,sha256=eji2HCTjRed7K4JRHAmLLwyliPBsEgYbUUY0lJ5-OAY,6126
  evalscope/models/choice_adapter.py,sha256=jj_6KB1BAsvv4Yufn2bM2tCiLovFUum2368lseogmb8,8036
  evalscope/models/custom_adapter.py,sha256=Ed_MGEcZxKK4mkXTpUY4GXTsayprHzIEOC1L9gqwjf4,2284
- evalscope/models/local_model.py,sha256=s0YVX9Djqazusk7qzSpWQB76jGGuzJxqQlZzomsCFsk,2621
+ evalscope/models/local_model.py,sha256=yydggBCLcBAmUWbBhv7o2CA3RbG0DwDZharPdrkbNcg,2628
  evalscope/models/model.py,sha256=diu4TE1ZFWdynTxsl4DejTNsLdwjxoyj2nsKR-Y8EZE,7343
- evalscope/models/server_adapter.py,sha256=iVJuUJlHGVGxnlrDMnbHZ8WQ4OR2HK5HrXH4obD2_cg,4173
+ evalscope/models/server_adapter.py,sha256=l_EI1jTaH1EBATKaH1USAdiYkezz7IYUQTwhURivXx0,5710
  evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk77ksNQ,102
  evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
  evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
- evalscope/perf/__init__.py,sha256=rgSXzxIJ67yB_SLUdl4ljem2-ilB-Gw3640f4KWLO1k,51
- evalscope/perf/arguments.py,sha256=8KiD4u51B_twEaIiI0_kw4Jknk3YG4S6XN-vgvutChA,9233
+ evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ evalscope/perf/arguments.py,sha256=3az0usVSjz0CCcDxNkHFm4mMb8tw6cE3qIAnvhLxos4,9430
  evalscope/perf/benchmark.py,sha256=qNgDNseW8N0beuAB_4-JVtTdHs7ZaJEHK5XnkMU9vRU,9618
- evalscope/perf/http_client.py,sha256=TfnQT9OaBlUCpGwi4ifSJBaaGsn3P2KVBPMGuw-Rqkk,7073
+ evalscope/perf/http_client.py,sha256=qHIhsSUXHbh5HGqW9JmTJs1O8yrIYVXzSOgXwWlqiyA,7109
  evalscope/perf/main.py,sha256=SUMz8S2XPL8JaSL1-vy8qkrb34d5vp6DfQdwIGOUXTk,1277
  evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
  evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
@@ -195,7 +201,7 @@ evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2m
  evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
  evalscope/perf/plugin/api/custom_api.py,sha256=ay1AGi4y2opjwyRl0J0A54-vLB-pBj3QBFkzog0KA-g,3787
  evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
- evalscope/perf/plugin/api/openai_api.py,sha256=JxQGlzAbM7MBWcr3MvWiAg6E4lqdQLfkk1qK0vUWvn8,6817
+ evalscope/perf/plugin/api/openai_api.py,sha256=KQRQMOfQceKQtrvTE-SyhNHcDoGuQ0900yh7r74Hcoo,7560
  evalscope/perf/plugin/datasets/__init__.py,sha256=9mz2TnVHhxbEKAS9pLbKMQuIoShNlZpGiRo9e2RQLUs,490
  evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
  evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815
@@ -206,7 +212,7 @@ evalscope/perf/plugin/datasets/openqa.py,sha256=2pv7yyPSFYTjPhvAGBsHl0eQO8gt7Wk1
  evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=ef8MXhT6756y6LsXSpYeWjmwswu2hRXe2BOVS2_OgVM,1968
  evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/perf/utils/analysis_result.py,sha256=ig0zPwbUODGh1GUr3GmnNF4lJJp9SQvW0awWiXEIkCI,1212
- evalscope/perf/utils/benchmark_util.py,sha256=T_pXpSCwCNLJgfzgv3IO7kG61ghTLthVMsXZhBCGP_4,5541
+ evalscope/perf/utils/benchmark_util.py,sha256=4TyQ_tE5odcjKDFDueI3jrC0vld6QxmTreOd5_SP4vE,5802
  evalscope/perf/utils/db_util.py,sha256=PSBq16uWyzXx0zyoEE4wazWKN19UAA8_GjobS7rTPso,9001
  evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
  evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627
@@ -232,7 +238,7 @@ evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHM
  evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
  evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
  evalscope/report/__init__.py,sha256=0Wes3ot2hy9s-WwZaBztst8qkNrXkOF-Hwa1WW1e8lY,260
- evalscope/report/app.py,sha256=adP1rVVOxYMbCTdopV3FKWBhUzB7t1AXcDOxW4Ct56g,26647
+ evalscope/report/app.py,sha256=lwyeDfxgzTbvy4TXtGYtkBegn33zcAuR0_776i5E2fw,26812
  evalscope/report/combinator.py,sha256=bi6nvTbMrzraZ8kUZ6mIMikk8-qEIVYUhdaH4RE1Tg8,2653
  evalscope/report/generator.py,sha256=2DULY9W8QCUxdtyfNjo8XAP_YxI1LgR95jknK__kYPU,3600
  evalscope/report/utils.py,sha256=DRlbjbqHEmM8rGlA4pwtlHFhOZtyUzcqiS-mejfIDkU,4584
@@ -253,6 +259,14 @@ evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl,sha256=odT
  evalscope/third_party/longbench_write/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/third_party/longbench_write/tools/data_etl.py,sha256=T7a-4PwZg5alZQh-oTi1zjMxjGmVVZYVwSR9-diZlF8,5971
  evalscope/third_party/longbench_write/tools/openai_api.py,sha256=PiIvvDYJkn041SJkLoroXwl1B8TtwpB7licVfqNSeuQ,8168
+ evalscope/third_party/thinkbench/__init__.py,sha256=C0aSu71_dc1upUVkKmq2VgDd9plpRcYUdCE6BjUWJcA,110
+ evalscope/third_party/thinkbench/eval.py,sha256=P-nNryNKc6DMhD6LLTWnpIzYtfxLh67P9GArtq4jT_U,10988
+ evalscope/third_party/thinkbench/infer.py,sha256=-2PeyPurgZSwP_TRBNM-Xg3gwjIWRsn5oX0EpSt-0-A,3140
+ evalscope/third_party/thinkbench/resources/critique_template.txt,sha256=d4Egc-qH--4lG8X_EcmgymnuZgiCMbee1M5pt4HrRKA,535
+ evalscope/third_party/thinkbench/resources/reformat_template.txt,sha256=zTZyVAzmMBtAwI9lHly9EXsqX471OW-VTg538PDcB30,1775
+ evalscope/third_party/thinkbench/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ evalscope/third_party/thinkbench/tools/llm.py,sha256=PkuUUoVlyWXwOwPoMJIGIl0VQr0N83uNYGkA2rBjpe8,1333
+ evalscope/third_party/thinkbench/tools/utils.py,sha256=rDu2GVTK4ji9Yh9RLVksZqrfurQsSuN9GW3QCKJ60ng,401
  evalscope/third_party/toolbench_static/README.md,sha256=Osdnt0_K-setbmYwDPCPRp2LXxamGp2mE8KsOByPPOY,3944
  evalscope/third_party/toolbench_static/__init__.py,sha256=BO936RxwodHr4OEpV6W3S_keC91OfOd41_msIJ2d0fs,128
  evalscope/third_party/toolbench_static/config_default.json,sha256=KrUzeHL2DNiM5FwY7cH3KZlxTwELCQZ6e39nilfUi0M,368
@@ -269,13 +283,13 @@ evalscope/utils/chat_service.py,sha256=eZ8uyVeVFpXZo_uvRFyVhnFyJpL14zcn9UA6K4Ax5
  evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
  evalscope/utils/io_utils.py,sha256=Tjdgen1FsAA4ArqiUzu734L0Px5NuiS0GKRRiGIzxSA,4192
  evalscope/utils/logger.py,sha256=49F2WDi1g_o8aW8Z29wOt9YHE9LDqkHIgb-d8TVybJY,3635
- evalscope/utils/model_utils.py,sha256=PK7pKNY8ovtGZHNRvDpZ-d8zBHMOkxd6fRVkM8VF06I,736
- evalscope/utils/utils.py,sha256=a6a2vDDxqlj7nY8xynkKkWs_ZPXEU2UMwvxp0JEpHjg,9686
+ evalscope/utils/model_utils.py,sha256=hB9W334ecAb6553FhooT6_jM0g-tjj6AU48IV3K1CKw,1131
+ evalscope/utils/utils.py,sha256=lGvn94ryIzx-7WLNJeuyehNTmINt0jYIjrjW12woPCs,9730
  tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
  tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  tests/cli/test_collection.py,sha256=gx3GySIAPNaLUSf3D3Q3V0WZc21BPdNthIbECHQN0TI,3026
- tests/cli/test_run.py,sha256=gtId2SF1LlDCIn4S_WKRpAyTig_pWOhY8yto4P5B1EY,8303
+ tests/cli/test_run.py,sha256=VV6XTiNSuQiuw6j_jqPWKgCgouNYt8OFmJr-dFmMFDg,8759
  tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  tests/perf/test_perf.py,sha256=iB8Mg565SfwPsObdAByHYfZNqN71kUtPW7ucmyiOWo8,3025
  tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -288,9 +302,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
  tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
  tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
- evalscope-0.11.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
- evalscope-0.11.0.dist-info/METADATA,sha256=GL8Ybyby65DYg8jxjxzdcFYvXBhKzE7eRFIBRiJ0-hc,29584
- evalscope-0.11.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
- evalscope-0.11.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
- evalscope-0.11.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
- evalscope-0.11.0.dist-info/RECORD,,
+ evalscope-0.12.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
+ evalscope-0.12.0.dist-info/METADATA,sha256=u2yGTXt6DLWEklbCHuclmS4gpiu6AbdBrosLK8HUOmk,30499
+ evalscope-0.12.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ evalscope-0.12.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
+ evalscope-0.12.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
+ evalscope-0.12.0.dist-info/RECORD,,
tests/cli/test_run.py CHANGED
@@ -76,10 +76,10 @@ class TestRun(unittest.TestCase):
  # 'mmlu_pro',
  # 'bbh',
  # 'hellaswag',
- # 'gsm8k',
+ 'gsm8k',
  # 'arc',
  # 'race',
- 'ifeval',
+ # 'ifeval',
  # 'truthful_qa',
  # 'trivia_qa',
  ],
@@ -101,7 +101,8 @@ class TestRun(unittest.TestCase):
  'local_path': 'custom_eval/text/mcq', # custom dataset path
  'subset_list': [
  'example' # evaluation subset name: the * in the *_dev.csv files above
- ]
+ ],
+ 'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}' # question template
  },
  'general_qa': {
  'local_path': 'custom_eval/text/qa', # custom dataset path
@@ -111,7 +112,8 @@ class TestRun(unittest.TestCase):
  }
  },
  )
- run_task(task_cfg=task_cfg)
+ res = run_task(task_cfg=task_cfg)
+ print(res)
 
  @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
  def test_run_humaneval(self):
@@ -140,24 +142,26 @@ class TestRun(unittest.TestCase):
 
  task_cfg = TaskConfig(
  model='Qwen2.5-0.5B-Instruct',
- api_url='http://127.0.0.1:8801/v1/chat/completions',
+ api_url='http://127.0.0.1:8801/v1',
  api_key='EMPTY',
  eval_type=EvalType.SERVICE,
  datasets=[
- # 'iquiz',
+ 'iquiz',
  # 'ifeval',
  # 'mmlu',
  # 'mmlu_pro',
+ # 'musr',
+ # 'process_bench',
  # 'race',
  # 'trivia_qa',
  # 'cmmlu',
  # 'humaneval',
  # 'gsm8k',
  # 'bbh',
- 'competition_math',
- 'math_500',
- 'aime24',
- 'gpqa',
+ # 'competition_math',
+ # 'math_500',
+ # 'aime24',
+ # 'gpqa',
  # 'arc',
  # 'ceval',
  # 'hellaswag',
@@ -168,8 +172,8 @@ class TestRun(unittest.TestCase):
  'few_shot_num': 0
  },
  'mmlu_pro': {
- 'subset_list': ['math'],
- 'few_shot_num': 0
+ 'subset_list': ['math', 'health'],
+ 'few_shot_num': 4
  },
  'ceval': {
  'subset_list': [
@@ -194,15 +198,23 @@ class TestRun(unittest.TestCase):
  'competition_math': {
  'subset_list': ['Level 1']
  },
+ 'process_bench': {
+ 'subset_list': ['gsm8k'],
+ },
+ 'musr': {
+ 'subset_list': ['murder_mysteries']
+ },
  },
  eval_batch_size=5,
- limit=10,
+ limit=5,
  debug=True,
+ stream=True,
  generation_config={
  'temperature': 0.7,
- 'n': 5
+ 'n': 1,
+ 'max_tokens': 512,
  },
- use_cache='/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250212_150525'
+ # use_cache='/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250212_150525',
  )
 
  run_task(task_cfg=task_cfg)
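
The test changes above exercise two additions worth calling out: the return value of `run_task` is now captured and printed, and the custom `general_mcq` dataset accepts a `query_template`. Below is a minimal, self-contained sketch of that configuration; the paths, subset name, and template string are copied from the test diff, while the model name and import paths are assumptions based on the package layout, and a local `custom_eval/text/mcq` folder following the `*_dev.csv` naming referenced in the test comments is presumed to exist.

```python
# Sketch only: a custom MCQ evaluation using the query_template added in 0.12.0.
from evalscope.config import TaskConfig
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='Qwen2.5-0.5B-Instruct',
    datasets=['general_mcq'],
    dataset_args={
        'general_mcq': {
            'local_path': 'custom_eval/text/mcq',  # custom dataset path
            'subset_list': ['example'],            # the * in *_dev.csv
            # New in 0.12.0: control how each question is rendered into the prompt
            'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}',
        },
    },
)

res = run_task(task_cfg=task_cfg)  # the return value can now be inspected, as in the test
print(res)
```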