evalscope 0.11.0__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope has been flagged by the registry as possibly problematic.
- evalscope/arguments.py +2 -0
- evalscope/benchmarks/aime/aime25_adapter.py +49 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +0 -5
- evalscope/benchmarks/benchmark.py +3 -1
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -17
- evalscope/benchmarks/data_adapter.py +71 -18
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +6 -10
- evalscope/benchmarks/general_qa/general_qa_adapter.py +4 -5
- evalscope/benchmarks/gpqa/gpqa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +1 -1
- evalscope/benchmarks/ifeval/ifeval_adapter.py +1 -1
- evalscope/benchmarks/math_500/math_500_adapter.py +10 -1
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +16 -32
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +68 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/critique_template.txt +13 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +96 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -1
- evalscope/cli/start_app.py +4 -1
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +4 -2
- evalscope/collections/evaluator.py +6 -0
- evalscope/config.py +3 -1
- evalscope/evaluator/evaluator.py +3 -1
- evalscope/metrics/__init__.py +2 -1
- evalscope/metrics/metrics.py +23 -2
- evalscope/models/base_adapter.py +7 -1
- evalscope/models/chat_adapter.py +1 -1
- evalscope/models/local_model.py +3 -2
- evalscope/models/server_adapter.py +79 -28
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +5 -1
- evalscope/perf/http_client.py +2 -2
- evalscope/perf/plugin/api/openai_api.py +11 -1
- evalscope/perf/utils/benchmark_util.py +6 -2
- evalscope/report/app.py +12 -8
- evalscope/run.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +264 -0
- evalscope/third_party/thinkbench/infer.py +100 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +47 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/utils/model_utils.py +17 -1
- evalscope/utils/utils.py +45 -45
- evalscope/version.py +2 -2
- {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/METADATA +9 -4
- {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/RECORD +58 -44
- tests/cli/test_run.py +27 -15
- /evalscope/benchmarks/{aime24 → aime}/__init__.py +0 -0
- /evalscope/benchmarks/{aime24 → aime}/aime24_adapter.py +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/LICENSE +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/WHEEL +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/top_level.txt +0 -0
{evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.11.0
+Version: 0.12.0
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team

@@ -21,6 +21,7 @@ Requires-Dist: accelerate
 Requires-Dist: cachetools
 Requires-Dist: datasets<=3.2.0,>=3.0.0
 Requires-Dist: editdistance
+Requires-Dist: immutabledict
 Requires-Dist: jieba
 Requires-Dist: jsonlines
 Requires-Dist: langdetect

@@ -58,6 +59,7 @@ Requires-Dist: accelerate; extra == "all"
 Requires-Dist: cachetools; extra == "all"
 Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
 Requires-Dist: editdistance; extra == "all"
+Requires-Dist: immutabledict; extra == "all"
 Requires-Dist: jieba; extra == "all"
 Requires-Dist: jsonlines; extra == "all"
 Requires-Dist: langdetect; extra == "all"

@@ -101,10 +103,10 @@ Requires-Dist: sse-starlette; extra == "all"
 Requires-Dist: transformers; extra == "all"
 Requires-Dist: unicorn; extra == "all"
 Requires-Dist: gradio==5.4.0; extra == "all"
-Requires-Dist: plotly
+Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
 Provides-Extra: app
 Requires-Dist: gradio==5.4.0; extra == "app"
-Requires-Dist: plotly
+Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
 Provides-Extra: inner
 Requires-Dist: absl-py; extra == "inner"
 Requires-Dist: accelerate; extra == "inner"

@@ -223,6 +225,9 @@ Please scan the QR code below to join our community groups:
 
 
 ## 🎉 News
+- 🔥 **[2025.02.27]** Added support for evaluating the reasoning efficiency of models. Refer to [📖 Best Practices for Evaluating Thinking Efficiency](https://evalscope.readthedocs.io/zh-cn/latest/best_practice/think_eval.html). This implementation is inspired by the works [Overthinking](https://doi.org/10.48550/arXiv.2412.21187) and [Underthinking](https://doi.org/10.48550/arXiv.2501.18585).
+- 🔥 **[2025.02.25]** Added support for two model inference-related evaluation benchmarks: [MuSR](https://modelscope.cn/datasets/AI-ModelScope/MuSR) and [ProcessBench](https://www.modelscope.cn/datasets/Qwen/ProcessBench/summary). To use them, simply specify `musr` and `process_bench` respectively in the datasets parameter.
+- 🔥 **[2025.02.18]** Supports the AIME25 dataset, which contains 15 questions (Grok3 scored 93 on this dataset).
 - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/zh-cn/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
 - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
 - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
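The 2025.02.25 and 2025.02.18 entries above introduce the MuSR, ProcessBench, and AIME25 adapters that appear later in this diff; they are selected through the `datasets` parameter. A minimal sketch of what that could look like with the `TaskConfig`/`run_task` entry points exercised in the `tests/cli/test_run.py` diff below (the import paths, the model name, and the `aime25` dataset id are assumptions, not taken verbatim from this diff):

```python
# Sketch only: selecting the benchmarks added in 0.12.0.
# Import paths and the model id are assumed; subset names mirror tests/cli/test_run.py.
from evalscope.config import TaskConfig
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='Qwen2.5-0.5B-Instruct',                 # placeholder model
    datasets=['musr', 'process_bench', 'aime25'],  # 'aime25' id assumed from aime25_adapter.py
    dataset_args={
        'musr': {'subset_list': ['murder_mysteries']},
        'process_bench': {'subset_list': ['gsm8k']},
    },
    eval_batch_size=5,  # batched evaluation mentioned in the 2025.02.13 entry
    limit=5,            # only a few samples per subset, as in the test config
)

run_task(task_cfg=task_cfg)
```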
@@ -460,7 +465,7 @@ Then, you can use the following command to evaluate the model API service:
 ```shell
 evalscope eval \
  --model qwen2.5 \
- --api-url http://127.0.0.1:8801/v1
+ --api-url http://127.0.0.1:8801/v1 \
  --api-key EMPTY \
  --eval-type service \
  --datasets gsm8k \
{evalscope-0.11.0.dist-info → evalscope-0.12.0.dist-info}/RECORD CHANGED

@@ -1,11 +1,11 @@
 evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
-evalscope/arguments.py,sha256=
-evalscope/config.py,sha256=
+evalscope/arguments.py,sha256=lYxhmZVs-dUz8q9cpwjoe-HuwglkkgxiSaluuXlAmAc,4814
+evalscope/config.py,sha256=BZv7maQTbxXkb2WzdeGGQr0U01_TXy-Q7PujOiPJ4D8,8703
 evalscope/constants.py,sha256=bkcDVbB4Pr1Qxz83qefcWjEetVGiHTcx3m84WX14ASI,3330
-evalscope/run.py,sha256=
+evalscope/run.py,sha256=zRdBJEYdQ6JzH94eA7gfkzFAvsn3UFwdrvX_snaqGNU,5702
 evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
 evalscope/summarizer.py,sha256=Wnt8Y61PasOkigo8zMpi1qdFzfETjfnDtCDDQ6VwgDw,5867
-evalscope/version.py,sha256=
+evalscope/version.py,sha256=RDE_Gbn1y54qtXxjxbZOTLDFSkq__2Zy3rAOwyVrvPs,119
 evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
 evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135

@@ -56,15 +56,16 @@ evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4ty
 evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
 evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
 evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
-evalscope/benchmarks/benchmark.py,sha256=
-evalscope/benchmarks/data_adapter.py,sha256=
-evalscope/benchmarks/
-evalscope/benchmarks/
+evalscope/benchmarks/benchmark.py,sha256=Kaes5Bg9_bvFO99-JztNlv_TPg4jH9vMYvnMcb1C_G8,2507
+evalscope/benchmarks/data_adapter.py,sha256=e4mtvzlC8ehQ0N4C5PAGJFv5N9Y42WT-OklwaV-Ex1Y,15239
+evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+evalscope/benchmarks/aime/aime24_adapter.py,sha256=FYH8NsT1nis3VoBMzRM_ueOsGNXjOKZCa6J_wpUM3RQ,1772
+evalscope/benchmarks/aime/aime25_adapter.py,sha256=bws4dajr5xuMDvuTluDb80oBYUTUlu_geKvmnNO3_OQ,1766
 evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
 evalscope/benchmarks/arc/arc_adapter.py,sha256=vfwAy01LA141qn1lsSyZmEIGWbbhOCRMOGoSM-K2z6M,6490
 evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-evalscope/benchmarks/bbh/bbh_adapter.py,sha256=
+evalscope/benchmarks/bbh/bbh_adapter.py,sha256=cep-Ws9Tozju6JWls1-oz3lKYqTL8q8Cee_d-d3cLIo,8407
 evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
 evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt,sha256=sfo-2iOeVzB0OGgd7NSQFELTGDTsr2DQ3u-g0ivI-sM,3653
 evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt,sha256=UJBsc3Mwz8TZngdWH_NFlhhNbLhNHK6FvW9FHcS8H5g,1167
@@ -101,19 +102,19 @@ evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=1RmhI0SNxHK-Fz-iTIR76zeBRDLlm
 evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
 evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
-evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=
+evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=_vGkfgP5ZnQh3AlbJqycQOL_gQHayazMYFzHVo2e5O8,6902
 evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=o3Q6ke-RLx4qUbF5FgASZogv3-kCJ6qpK43F_LARU3Y,2496
 evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=
+evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=M-PocYW4pkGtKOKvFZW-bIoztcGvmHn5Gf5o7F71xCg,5248
 evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=
+evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=_t2ZNsQzY5AuOOBXkEVNGIB3pZgLKQmw7-5gEqR_Z_k,3848
 evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
-evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=
+evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=1zI6GWfvPiKaZg39N7pSFw2R-GpbrjEo-11K_waq6Dg,4686
 evalscope/benchmarks/gsm8k/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTvt5hpXg,4233
-evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=
+evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=X7fu2mx911Al-7a6j-mJQ3vqTb0cN0u7FoJTrNf6AN4,10661
 evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
 evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=qArX2umdrYJZkDA9i3XGBGljCton99v5Yss9be9iZYw,6269

@@ -121,7 +122,7 @@ evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0
 evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
 evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=onacZB_6SF9239Ly-U70__WYsinS9iWpnf3oiYMNxKc,5164
 evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=
+evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=zmN69NDhBR3NJak1cB0z3DqPMuoAvqADWMapQPnvGLs,2025
 evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
 evalscope/benchmarks/ifeval/instructions_registry.py,sha256=tVUmhuSwnOidLtI8onOAw_gpJ6bi8FL07GiX19hSuo8,7288
 evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2OF-oH_NcSZtzyZDURW4,25780

@@ -129,13 +130,18 @@ evalscope/benchmarks/ifeval/utils.py,sha256=TKrM1m2qDCUauahogItDdICf4mDk0OjasSxg
 evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=nv4mzKOPp1YPcr6e7daZuZyQ3jRNNG6PUzi38REuwSk,2356
 evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/benchmarks/math_500/math_500_adapter.py,sha256=
+evalscope/benchmarks/math_500/math_500_adapter.py,sha256=OO3Jx1WuyEMfd4R5znG9_O5ln_SbVVGB5u1bTjiuWaU,2104
 evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
 evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=pmT1y9dbWJcZK3U6hkXa3-lBDABx7DhQ7oHc3O-Nkg0,11769
 evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
 evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=
+evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=73TLdWlEAulZNA0ZMLDQnaXs435vG-gD89yjURjsjpo,4111
+evalscope/benchmarks/musr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+evalscope/benchmarks/musr/musr_adapter.py,sha256=D-CZMTr9Ld_tJxZdCDPZQxDX86BgJqKWCyy5-tlcONY,2343
+evalscope/benchmarks/process_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+evalscope/benchmarks/process_bench/critique_template.txt,sha256=tycx8n42QEC0uGcwbIvHfZvfTnchlRxGz8Tp1R2_e_Y,489
+evalscope/benchmarks/process_bench/process_bench_adapter.py,sha256=YavcEDpnURVV0gCWTXDKq81CyEOgn8ASaVQu2h765to,3389
 evalscope/benchmarks/race/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO61nU,3608
 evalscope/benchmarks/race/race_adapter.py,sha256=dC9I-3T9UFh2OVpmWKRmSszPOlFZAZ40xOPa4zN3daI,6661
@@ -146,27 +152,27 @@ evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI
 evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=GVuJT-Xz4ugVtcUSTRxcBgViHVowcqJf3yVsotcZoZI,5062
 evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
-evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=
+evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=MYMLpIEv3oqRiOgAydqM0ZlzpvbzpCymOKUbca98yxo,12915
 evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
 evalscope/cli/cli.py,sha256=w_dtXljur9s5lmTn6LbbFL_viTPQB1WAEzhYcId09Og,729
-evalscope/cli/start_app.py,sha256=
-evalscope/cli/start_eval.py,sha256=
-evalscope/cli/start_perf.py,sha256=
+evalscope/cli/start_app.py,sha256=WTbba_Iitz1jkQ5n6KHRH-i3U7qJIM7iCi4a9roWjaA,808
+evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,775
+evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
 evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
 evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
-evalscope/collections/evaluator.py,sha256=
+evalscope/collections/evaluator.py,sha256=1bz2jEgOlv7qHeCgkFCtd1MPWhMa6XnZfP4XJBhTnUc,12321
 evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
 evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
 evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
-evalscope/evaluator/evaluator.py,sha256=
+evalscope/evaluator/evaluator.py,sha256=VIiw1eI46UOsFWNd7schD4ah_Q5ll0crl2sRmGIRmig,17649
 evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
 evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/evaluator/reviewer/auto_reviewer.py,sha256=PDz1Nt_qq0oGZBBwek2-M8VBUNLkOkmYObzR8gO7nEc,16624
-evalscope/metrics/__init__.py,sha256=
+evalscope/metrics/__init__.py,sha256=SWvqzUzdryW5URz6u4fPkP9XSyA09nQ8zBeE8BbchSg,349
 evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
 evalscope/metrics/math_parser.py,sha256=uTDudn305G3b8-GboWTrDE6OfrEwAW-areHnoGXZ6Is,17302
-evalscope/metrics/metrics.py,sha256=
+evalscope/metrics/metrics.py,sha256=_YI7RhxlFu_JOgeE3LF9UKu6mJruvyu4FgqVf78Bjb8,13813
 evalscope/metrics/named_metrics.py,sha256=SeBXmgWyK4y4tKiGKro3k-CZU1OShuKe6qxwpT3tizY,1313
 evalscope/metrics/rouge_metric.py,sha256=zhIUqenSuxnORR9tamLQBGjFwP91Zei2UiLtcOyseVM,4639
 evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650

@@ -174,20 +180,20 @@ evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=Kq6AObenmLVQ5tN3NgN
 evalscope/metrics/resources/gpt2-zhcn3-v4.bpe,sha256=J_K-oSt9usFEw87Av6j7ETn3J48EsmFuY5_iVvY6xjc,524464
 evalscope/metrics/resources/gpt2-zhcn3-v4.json,sha256=WkM4J_FDPPNQwYi0kj5sM5SVjk2_6bci7tqf8dV9p_U,1289341
 evalscope/models/__init__.py,sha256=pafIEbJq_2DrYjQbgI0SNVxywNYOxvqwk7Dr1P7KEwk,923
-evalscope/models/base_adapter.py,sha256=
-evalscope/models/chat_adapter.py,sha256=
+evalscope/models/base_adapter.py,sha256=04VK4A5L0naOllBW9fw03GduvBUNgStliyFBTZKY0xU,2297
+evalscope/models/chat_adapter.py,sha256=eji2HCTjRed7K4JRHAmLLwyliPBsEgYbUUY0lJ5-OAY,6126
 evalscope/models/choice_adapter.py,sha256=jj_6KB1BAsvv4Yufn2bM2tCiLovFUum2368lseogmb8,8036
 evalscope/models/custom_adapter.py,sha256=Ed_MGEcZxKK4mkXTpUY4GXTsayprHzIEOC1L9gqwjf4,2284
-evalscope/models/local_model.py,sha256=
+evalscope/models/local_model.py,sha256=yydggBCLcBAmUWbBhv7o2CA3RbG0DwDZharPdrkbNcg,2628
 evalscope/models/model.py,sha256=diu4TE1ZFWdynTxsl4DejTNsLdwjxoyj2nsKR-Y8EZE,7343
-evalscope/models/server_adapter.py,sha256=
+evalscope/models/server_adapter.py,sha256=l_EI1jTaH1EBATKaH1USAdiYkezz7IYUQTwhURivXx0,5710
 evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk77ksNQ,102
 evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
 evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
-evalscope/perf/__init__.py,sha256=
-evalscope/perf/arguments.py,sha256=
+evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+evalscope/perf/arguments.py,sha256=3az0usVSjz0CCcDxNkHFm4mMb8tw6cE3qIAnvhLxos4,9430
 evalscope/perf/benchmark.py,sha256=qNgDNseW8N0beuAB_4-JVtTdHs7ZaJEHK5XnkMU9vRU,9618
-evalscope/perf/http_client.py,sha256=
+evalscope/perf/http_client.py,sha256=qHIhsSUXHbh5HGqW9JmTJs1O8yrIYVXzSOgXwWlqiyA,7109
 evalscope/perf/main.py,sha256=SUMz8S2XPL8JaSL1-vy8qkrb34d5vp6DfQdwIGOUXTk,1277
 evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
 evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
@@ -195,7 +201,7 @@ evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2m
 evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
 evalscope/perf/plugin/api/custom_api.py,sha256=ay1AGi4y2opjwyRl0J0A54-vLB-pBj3QBFkzog0KA-g,3787
 evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
-evalscope/perf/plugin/api/openai_api.py,sha256=
+evalscope/perf/plugin/api/openai_api.py,sha256=KQRQMOfQceKQtrvTE-SyhNHcDoGuQ0900yh7r74Hcoo,7560
 evalscope/perf/plugin/datasets/__init__.py,sha256=9mz2TnVHhxbEKAS9pLbKMQuIoShNlZpGiRo9e2RQLUs,490
 evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
 evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815

@@ -206,7 +212,7 @@ evalscope/perf/plugin/datasets/openqa.py,sha256=2pv7yyPSFYTjPhvAGBsHl0eQO8gt7Wk1
 evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=ef8MXhT6756y6LsXSpYeWjmwswu2hRXe2BOVS2_OgVM,1968
 evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/perf/utils/analysis_result.py,sha256=ig0zPwbUODGh1GUr3GmnNF4lJJp9SQvW0awWiXEIkCI,1212
-evalscope/perf/utils/benchmark_util.py,sha256=
+evalscope/perf/utils/benchmark_util.py,sha256=4TyQ_tE5odcjKDFDueI3jrC0vld6QxmTreOd5_SP4vE,5802
 evalscope/perf/utils/db_util.py,sha256=PSBq16uWyzXx0zyoEE4wazWKN19UAA8_GjobS7rTPso,9001
 evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
 evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627

@@ -232,7 +238,7 @@ evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHM
 evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
 evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
 evalscope/report/__init__.py,sha256=0Wes3ot2hy9s-WwZaBztst8qkNrXkOF-Hwa1WW1e8lY,260
-evalscope/report/app.py,sha256=
+evalscope/report/app.py,sha256=lwyeDfxgzTbvy4TXtGYtkBegn33zcAuR0_776i5E2fw,26812
 evalscope/report/combinator.py,sha256=bi6nvTbMrzraZ8kUZ6mIMikk8-qEIVYUhdaH4RE1Tg8,2653
 evalscope/report/generator.py,sha256=2DULY9W8QCUxdtyfNjo8XAP_YxI1LgR95jknK__kYPU,3600
 evalscope/report/utils.py,sha256=DRlbjbqHEmM8rGlA4pwtlHFhOZtyUzcqiS-mejfIDkU,4584
@@ -253,6 +259,14 @@ evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl,sha256=odT
 evalscope/third_party/longbench_write/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/third_party/longbench_write/tools/data_etl.py,sha256=T7a-4PwZg5alZQh-oTi1zjMxjGmVVZYVwSR9-diZlF8,5971
 evalscope/third_party/longbench_write/tools/openai_api.py,sha256=PiIvvDYJkn041SJkLoroXwl1B8TtwpB7licVfqNSeuQ,8168
+evalscope/third_party/thinkbench/__init__.py,sha256=C0aSu71_dc1upUVkKmq2VgDd9plpRcYUdCE6BjUWJcA,110
+evalscope/third_party/thinkbench/eval.py,sha256=P-nNryNKc6DMhD6LLTWnpIzYtfxLh67P9GArtq4jT_U,10988
+evalscope/third_party/thinkbench/infer.py,sha256=-2PeyPurgZSwP_TRBNM-Xg3gwjIWRsn5oX0EpSt-0-A,3140
+evalscope/third_party/thinkbench/resources/critique_template.txt,sha256=d4Egc-qH--4lG8X_EcmgymnuZgiCMbee1M5pt4HrRKA,535
+evalscope/third_party/thinkbench/resources/reformat_template.txt,sha256=zTZyVAzmMBtAwI9lHly9EXsqX471OW-VTg538PDcB30,1775
+evalscope/third_party/thinkbench/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+evalscope/third_party/thinkbench/tools/llm.py,sha256=PkuUUoVlyWXwOwPoMJIGIl0VQr0N83uNYGkA2rBjpe8,1333
+evalscope/third_party/thinkbench/tools/utils.py,sha256=rDu2GVTK4ji9Yh9RLVksZqrfurQsSuN9GW3QCKJ60ng,401
 evalscope/third_party/toolbench_static/README.md,sha256=Osdnt0_K-setbmYwDPCPRp2LXxamGp2mE8KsOByPPOY,3944
 evalscope/third_party/toolbench_static/__init__.py,sha256=BO936RxwodHr4OEpV6W3S_keC91OfOd41_msIJ2d0fs,128
 evalscope/third_party/toolbench_static/config_default.json,sha256=KrUzeHL2DNiM5FwY7cH3KZlxTwELCQZ6e39nilfUi0M,368

@@ -269,13 +283,13 @@ evalscope/utils/chat_service.py,sha256=eZ8uyVeVFpXZo_uvRFyVhnFyJpL14zcn9UA6K4Ax5
 evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
 evalscope/utils/io_utils.py,sha256=Tjdgen1FsAA4ArqiUzu734L0Px5NuiS0GKRRiGIzxSA,4192
 evalscope/utils/logger.py,sha256=49F2WDi1g_o8aW8Z29wOt9YHE9LDqkHIgb-d8TVybJY,3635
-evalscope/utils/model_utils.py,sha256=
-evalscope/utils/utils.py,sha256=
+evalscope/utils/model_utils.py,sha256=hB9W334ecAb6553FhooT6_jM0g-tjj6AU48IV3K1CKw,1131
+evalscope/utils/utils.py,sha256=lGvn94ryIzx-7WLNJeuyehNTmINt0jYIjrjW12woPCs,9730
 tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
 tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/cli/test_collection.py,sha256=gx3GySIAPNaLUSf3D3Q3V0WZc21BPdNthIbECHQN0TI,3026
-tests/cli/test_run.py,sha256=
+tests/cli/test_run.py,sha256=VV6XTiNSuQiuw6j_jqPWKgCgouNYt8OFmJr-dFmMFDg,8759
 tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/perf/test_perf.py,sha256=iB8Mg565SfwPsObdAByHYfZNqN71kUtPW7ucmyiOWo8,3025
 tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -288,9 +302,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
 tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
 tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
-evalscope-0.
-evalscope-0.
-evalscope-0.
-evalscope-0.
-evalscope-0.
-evalscope-0.
+evalscope-0.12.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
+evalscope-0.12.0.dist-info/METADATA,sha256=u2yGTXt6DLWEklbCHuclmS4gpiu6AbdBrosLK8HUOmk,30499
+evalscope-0.12.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+evalscope-0.12.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
+evalscope-0.12.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
+evalscope-0.12.0.dist-info/RECORD,,
tests/cli/test_run.py CHANGED

@@ -76,10 +76,10 @@ class TestRun(unittest.TestCase):
 # 'mmlu_pro',
 # 'bbh',
 # 'hellaswag',
-
+'gsm8k',
 # 'arc',
 # 'race',
-'ifeval',
+# 'ifeval',
 # 'truthful_qa',
 # 'trivia_qa',
 ],
@@ -101,7 +101,8 @@ class TestRun(unittest.TestCase):
 'local_path': 'custom_eval/text/mcq',  # custom dataset path
 'subset_list': [
 'example'  # evaluation subset name, i.e. the * in the *_dev.csv above
-]
+],
+'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
 },
 'general_qa': {
 'local_path': 'custom_eval/text/qa',  # custom dataset path

@@ -111,7 +112,8 @@ class TestRun(unittest.TestCase):
 }
 },
 )
-run_task(task_cfg=task_cfg)
+res = run_task(task_cfg=task_cfg)
+print(res)
 
 @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
 def test_run_humaneval(self):
@@ -140,24 +142,26 @@ class TestRun(unittest.TestCase):
 
 task_cfg = TaskConfig(
 model='Qwen2.5-0.5B-Instruct',
-api_url='http://127.0.0.1:8801/v1
+api_url='http://127.0.0.1:8801/v1',
 api_key='EMPTY',
 eval_type=EvalType.SERVICE,
 datasets=[
-
+'iquiz',
 # 'ifeval',
 # 'mmlu',
 # 'mmlu_pro',
+# 'musr',
+# 'process_bench',
 # 'race',
 # 'trivia_qa',
 # 'cmmlu',
 # 'humaneval',
 # 'gsm8k',
 # 'bbh',
-'competition_math',
-'math_500',
-'aime24',
-'gpqa',
+# 'competition_math',
+# 'math_500',
+# 'aime24',
+# 'gpqa',
 # 'arc',
 # 'ceval',
 # 'hellaswag',
@@ -168,8 +172,8 @@ class TestRun(unittest.TestCase):
 'few_shot_num': 0
 },
 'mmlu_pro': {
-'subset_list': ['math'],
-'few_shot_num':
+'subset_list': ['math', 'health'],
+'few_shot_num': 4
 },
 'ceval': {
 'subset_list': [

@@ -194,15 +198,23 @@ class TestRun(unittest.TestCase):
 'competition_math': {
 'subset_list': ['Level 1']
 },
+'process_bench': {
+'subset_list': ['gsm8k'],
+},
+'musr': {
+'subset_list': ['murder_mysteries']
+},
 },
 eval_batch_size=5,
-limit=
+limit=5,
 debug=True,
+stream=True,
 generation_config={
 'temperature': 0.7,
-'n':
+'n': 1,
+'max_tokens': 512,
 },
-use_cache='/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250212_150525'
+# use_cache='/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250212_150525',
 )
 
 run_task(task_cfg=task_cfg)
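For reference, the `query_template` key added to the `general_mcq` block above can also be used on its own for a custom multiple-choice dataset. A sketch assembled from the values shown in this test (import paths are assumed; the model id, `local_path`, and the `example` subset are the test's own placeholders, where `example` names the `example_dev.csv` split per the test comment):

```python
# Sketch of a custom multiple-choice evaluation using the new query_template field.
# Import paths are assumed; model, local_path, and subset names come from the test above.
from evalscope.config import TaskConfig
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='Qwen2.5-0.5B-Instruct',
    datasets=['general_mcq'],
    dataset_args={
        'general_mcq': {
            'local_path': 'custom_eval/text/mcq',  # directory holding the custom CSV files
            'subset_list': ['example'],            # the * in example_dev.csv
            'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}',
        }
    },
)

res = run_task(task_cfg=task_cfg)
print(res)
```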