evalscope 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/arguments.py +1 -0
- evalscope/benchmarks/aime24/__init__.py +0 -0
- evalscope/benchmarks/aime24/aime24_adapter.py +49 -0
- evalscope/benchmarks/arc/arc_adapter.py +5 -7
- evalscope/benchmarks/bbh/bbh_adapter.py +17 -9
- evalscope/benchmarks/benchmark.py +2 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
- evalscope/benchmarks/competition_math/competition_math_adapter.py +34 -23
- evalscope/benchmarks/data_adapter.py +18 -12
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +129 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -6
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +121 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +8 -13
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
- evalscope/benchmarks/ifeval/ifeval_adapter.py +14 -14
- evalscope/benchmarks/ifeval/instructions.py +3 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +49 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +27 -15
- evalscope/benchmarks/race/race_adapter.py +3 -3
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -8
- evalscope/cli/start_app.py +3 -2
- evalscope/collections/evaluator.py +103 -39
- evalscope/collections/sampler.py +2 -1
- evalscope/collections/schema.py +1 -2
- evalscope/config.py +1 -0
- evalscope/evaluator/evaluator.py +78 -64
- evalscope/metrics/math_parser.py +526 -0
- evalscope/metrics/metrics.py +16 -1
- evalscope/metrics/named_metrics.py +31 -7
- evalscope/models/chat_adapter.py +69 -47
- evalscope/models/choice_adapter.py +52 -45
- evalscope/models/custom_adapter.py +2 -2
- evalscope/models/local_model.py +4 -0
- evalscope/models/server_adapter.py +28 -34
- evalscope/report/app.py +298 -96
- evalscope/run.py +10 -7
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/io_utils.py +1 -1
- evalscope/version.py +2 -2
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/METADATA +20 -11
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/RECORD +57 -47
- tests/cli/test_run.py +93 -16
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/metrics/math_accuracy.py +0 -200
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/LICENSE +0 -0
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/WHEEL +0 -0
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/top_level.txt +0 -0
evalscope/utils/chat_service.py
CHANGED
|
@@ -174,7 +174,7 @@ class ChatService:
|
|
|
174
174
|
)
|
|
175
175
|
|
|
176
176
|
def _prepare_text_inputs(self, request: TextCompletionRequest):
|
|
177
|
-
inputs = self.tokenizer(request.prompt, return_tensors='pt', padding=
|
|
177
|
+
inputs = self.tokenizer(request.prompt, return_tensors='pt', padding=False).to(self.device)
|
|
178
178
|
prompt_tokens = len(inputs['input_ids'][0])
|
|
179
179
|
return inputs, prompt_tokens
|
|
180
180
|
|
|
@@ -204,7 +204,7 @@ class ChatService:
|
|
|
204
204
|
def _prepare_chat_inputs(self, request: ChatCompletionRequest):
|
|
205
205
|
formatted_prompt = self.tokenizer.apply_chat_template(
|
|
206
206
|
request.messages, tokenize=False, add_generation_prompt=True)
|
|
207
|
-
inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=
|
|
207
|
+
inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=False).to(self.device)
|
|
208
208
|
prompt_tokens = len(inputs['input_ids'][0])
|
|
209
209
|
return formatted_prompt, inputs, prompt_tokens
|
|
210
210
|
|
evalscope/utils/io_utils.py
CHANGED
|
@@ -135,7 +135,7 @@ def dict_to_yaml(d: dict, yaml_file: str):
|
|
|
135
135
|
Dump dict to yaml file.
|
|
136
136
|
"""
|
|
137
137
|
with open(yaml_file, 'w') as f:
|
|
138
|
-
yaml.dump(d, f, default_flow_style=False)
|
|
138
|
+
yaml.dump(d, f, default_flow_style=False, allow_unicode=True)
|
|
139
139
|
|
|
140
140
|
|
|
141
141
|
def json_to_dict(json_file) -> dict:
|
evalscope/version.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: evalscope
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.11.0
|
|
4
4
|
Summary: EvalScope: Lightweight LLMs Evaluation Framework
|
|
5
5
|
Home-page: https://github.com/modelscope/evalscope
|
|
6
6
|
Author: ModelScope team
|
|
@@ -19,10 +19,12 @@ License-File: LICENSE
|
|
|
19
19
|
Requires-Dist: absl-py
|
|
20
20
|
Requires-Dist: accelerate
|
|
21
21
|
Requires-Dist: cachetools
|
|
22
|
-
Requires-Dist: datasets<=3.0
|
|
22
|
+
Requires-Dist: datasets<=3.2.0,>=3.0.0
|
|
23
23
|
Requires-Dist: editdistance
|
|
24
24
|
Requires-Dist: jieba
|
|
25
25
|
Requires-Dist: jsonlines
|
|
26
|
+
Requires-Dist: langdetect
|
|
27
|
+
Requires-Dist: latex2sympy2
|
|
26
28
|
Requires-Dist: matplotlib
|
|
27
29
|
Requires-Dist: modelscope[framework]
|
|
28
30
|
Requires-Dist: nltk>=3.9
|
|
@@ -42,20 +44,24 @@ Requires-Dist: scikit-learn
|
|
|
42
44
|
Requires-Dist: seaborn
|
|
43
45
|
Requires-Dist: sentencepiece
|
|
44
46
|
Requires-Dist: simple-ddl-parser
|
|
47
|
+
Requires-Dist: sympy
|
|
45
48
|
Requires-Dist: tabulate
|
|
46
49
|
Requires-Dist: tiktoken
|
|
47
50
|
Requires-Dist: torch
|
|
48
51
|
Requires-Dist: tqdm
|
|
49
52
|
Requires-Dist: transformers>=4.33
|
|
50
53
|
Requires-Dist: transformers-stream-generator
|
|
54
|
+
Requires-Dist: word2number
|
|
51
55
|
Provides-Extra: all
|
|
52
56
|
Requires-Dist: absl-py; extra == "all"
|
|
53
57
|
Requires-Dist: accelerate; extra == "all"
|
|
54
58
|
Requires-Dist: cachetools; extra == "all"
|
|
55
|
-
Requires-Dist: datasets<=3.0
|
|
59
|
+
Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
|
|
56
60
|
Requires-Dist: editdistance; extra == "all"
|
|
57
61
|
Requires-Dist: jieba; extra == "all"
|
|
58
62
|
Requires-Dist: jsonlines; extra == "all"
|
|
63
|
+
Requires-Dist: langdetect; extra == "all"
|
|
64
|
+
Requires-Dist: latex2sympy2; extra == "all"
|
|
59
65
|
Requires-Dist: matplotlib; extra == "all"
|
|
60
66
|
Requires-Dist: modelscope[framework]; extra == "all"
|
|
61
67
|
Requires-Dist: nltk>=3.9; extra == "all"
|
|
@@ -75,12 +81,14 @@ Requires-Dist: scikit-learn; extra == "all"
|
|
|
75
81
|
Requires-Dist: seaborn; extra == "all"
|
|
76
82
|
Requires-Dist: sentencepiece; extra == "all"
|
|
77
83
|
Requires-Dist: simple-ddl-parser; extra == "all"
|
|
84
|
+
Requires-Dist: sympy; extra == "all"
|
|
78
85
|
Requires-Dist: tabulate; extra == "all"
|
|
79
86
|
Requires-Dist: tiktoken; extra == "all"
|
|
80
87
|
Requires-Dist: torch; extra == "all"
|
|
81
88
|
Requires-Dist: tqdm; extra == "all"
|
|
82
89
|
Requires-Dist: transformers>=4.33; extra == "all"
|
|
83
90
|
Requires-Dist: transformers-stream-generator; extra == "all"
|
|
91
|
+
Requires-Dist: word2number; extra == "all"
|
|
84
92
|
Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
|
|
85
93
|
Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
|
|
86
94
|
Requires-Dist: mteb==1.19.4; extra == "all"
|
|
@@ -92,10 +100,10 @@ Requires-Dist: numpy; extra == "all"
|
|
|
92
100
|
Requires-Dist: sse-starlette; extra == "all"
|
|
93
101
|
Requires-Dist: transformers; extra == "all"
|
|
94
102
|
Requires-Dist: unicorn; extra == "all"
|
|
95
|
-
Requires-Dist: gradio
|
|
103
|
+
Requires-Dist: gradio==5.4.0; extra == "all"
|
|
96
104
|
Requires-Dist: plotly>=5.23.0; extra == "all"
|
|
97
105
|
Provides-Extra: app
|
|
98
|
-
Requires-Dist: gradio
|
|
106
|
+
Requires-Dist: gradio==5.4.0; extra == "app"
|
|
99
107
|
Requires-Dist: plotly>=5.23.0; extra == "app"
|
|
100
108
|
Provides-Extra: inner
|
|
101
109
|
Requires-Dist: absl-py; extra == "inner"
|
|
@@ -215,7 +223,8 @@ Please scan the QR code below to join our community groups:
|
|
|
215
223
|
|
|
216
224
|
|
|
217
225
|
## 🎉 News
|
|
218
|
-
- 🔥 **[2025.
|
|
226
|
+
- 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/zh-cn/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
|
|
227
|
+
- 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
|
|
219
228
|
- 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
|
|
220
229
|
- 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
|
|
221
230
|
- 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
|
|
@@ -416,27 +425,27 @@ To create a public link, set `share=True` in `launch()`.
|
|
|
416
425
|
<table>
|
|
417
426
|
<tr>
|
|
418
427
|
<td style="text-align: center;">
|
|
419
|
-
<img src="docs/
|
|
428
|
+
<img src="docs/en/get_started/images/setting.png" alt="Setting" style="width: 75%;" />
|
|
420
429
|
<p>Setting Interface</p>
|
|
421
430
|
</td>
|
|
422
431
|
<td style="text-align: center;">
|
|
423
|
-
<img src="docs/
|
|
432
|
+
<img src="docs/en/get_started/images/model_compare.png" alt="Model Compare" style="width: 100%;" />
|
|
424
433
|
<p>Model Comparison</p>
|
|
425
434
|
</td>
|
|
426
435
|
</tr>
|
|
427
436
|
<tr>
|
|
428
437
|
<td style="text-align: center;">
|
|
429
|
-
<img src="docs/
|
|
438
|
+
<img src="docs/en/get_started/images/report_overview.png" alt="Report Overview" style="width: 100%;" />
|
|
430
439
|
<p>Report Overview</p>
|
|
431
440
|
</td>
|
|
432
441
|
<td style="text-align: center;">
|
|
433
|
-
<img src="docs/
|
|
442
|
+
<img src="docs/en/get_started/images/report_details.png" alt="Report Details" style="width: 80%;" />
|
|
434
443
|
<p>Report Details</p>
|
|
435
444
|
</td>
|
|
436
445
|
</tr>
|
|
437
446
|
</table>
|
|
438
447
|
|
|
439
|
-
For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/
|
|
448
|
+
For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)
|
|
440
449
|
|
|
441
450
|
## 🌐 Evaluation of Specified Model API
|
|
442
451
|
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
|
|
2
|
-
evalscope/arguments.py,sha256=
|
|
3
|
-
evalscope/config.py,sha256=
|
|
2
|
+
evalscope/arguments.py,sha256=r8gOMX6i8dWMl_WXLsBdHla7cuauBAyv9apky9VxLsE,4598
|
|
3
|
+
evalscope/config.py,sha256=D7C_K0f0xsfzFUSNSJJUTz3n9tmA6zLDbf8pZ_9ltpw,8600
|
|
4
4
|
evalscope/constants.py,sha256=bkcDVbB4Pr1Qxz83qefcWjEetVGiHTcx3m84WX14ASI,3330
|
|
5
|
-
evalscope/run.py,sha256=
|
|
5
|
+
evalscope/run.py,sha256=qfMqVWlUiXEiIJ665p3-IYWknhIeNZkCJe3Yn07Y74U,5692
|
|
6
6
|
evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
|
|
7
7
|
evalscope/summarizer.py,sha256=Wnt8Y61PasOkigo8zMpi1qdFzfETjfnDtCDDQ6VwgDw,5867
|
|
8
|
-
evalscope/version.py,sha256=
|
|
8
|
+
evalscope/version.py,sha256=h6YAZAgeAreWmKtpfr4D6BEvnWZxb1bka9hrpYOO0l8,119
|
|
9
9
|
evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
|
|
11
11
|
evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
|
|
@@ -56,13 +56,15 @@ evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4ty
|
|
|
56
56
|
evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
|
|
57
57
|
evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
|
|
58
58
|
evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
|
|
59
|
-
evalscope/benchmarks/benchmark.py,sha256=
|
|
60
|
-
evalscope/benchmarks/data_adapter.py,sha256=
|
|
59
|
+
evalscope/benchmarks/benchmark.py,sha256=IY2xYmNR58aYnZK7rnUDONWiLQopo_ZifGS2SfN2L-Q,2422
|
|
60
|
+
evalscope/benchmarks/data_adapter.py,sha256=xCBvJe4ubgpP1J8ElcWAJwF6B5CSrBEv_uMwQzlUaLY,12540
|
|
61
|
+
evalscope/benchmarks/aime24/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
62
|
+
evalscope/benchmarks/aime24/aime24_adapter.py,sha256=FYH8NsT1nis3VoBMzRM_ueOsGNXjOKZCa6J_wpUM3RQ,1772
|
|
61
63
|
evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
62
64
|
evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
|
|
63
|
-
evalscope/benchmarks/arc/arc_adapter.py,sha256=
|
|
65
|
+
evalscope/benchmarks/arc/arc_adapter.py,sha256=vfwAy01LA141qn1lsSyZmEIGWbbhOCRMOGoSM-K2z6M,6490
|
|
64
66
|
evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
65
|
-
evalscope/benchmarks/bbh/bbh_adapter.py,sha256=
|
|
67
|
+
evalscope/benchmarks/bbh/bbh_adapter.py,sha256=37wY3r1qW5qdjyKF-8n7UIM0IVcpaQugMb5Rkjbppxg,8524
|
|
66
68
|
evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
|
|
67
69
|
evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt,sha256=sfo-2iOeVzB0OGgd7NSQFELTGDTsr2DQ3u-g0ivI-sM,3653
|
|
68
70
|
evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt,sha256=UJBsc3Mwz8TZngdWH_NFlhhNbLhNHK6FvW9FHcS8H5g,1167
|
|
@@ -91,73 +93,81 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt
|
|
|
91
93
|
evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
|
|
92
94
|
evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
|
|
93
95
|
evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
94
|
-
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=
|
|
96
|
+
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=Qz2oNGw0H_4FtfY-Izdxv9fgwxScJksyvwzeQw-aVyo,11374
|
|
95
97
|
evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
|
|
96
|
-
evalscope/benchmarks/ceval/samples.jsonl,sha256=dyWhGAdt4eq6Amgu2Ykx8RevUJVFtbhGFSTbDAeUgHc,448
|
|
97
98
|
evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
98
99
|
evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
|
|
99
|
-
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=
|
|
100
|
+
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=1RmhI0SNxHK-Fz-iTIR76zeBRDLlm0m6_7rJywqk3Rk,10446
|
|
100
101
|
evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
|
|
101
102
|
evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
102
103
|
evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
|
|
103
|
-
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=
|
|
104
|
+
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=i0E4TNajMVcWT8lc5haIjKvdmHuI5qzgpssIm5Fw7bs,7413
|
|
105
|
+
evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
106
|
+
evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=o3Q6ke-RLx4qUbF5FgASZogv3-kCJ6qpK43F_LARU3Y,2496
|
|
107
|
+
evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
108
|
+
evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=vDHgsWpsIZQWNadl3mI8M3rDKkvPM2N2KAkW-8aeOHY,5130
|
|
104
109
|
evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
105
|
-
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=
|
|
110
|
+
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=fu14ZzGYyg2MEdJbxZGBoIbais6xA9Um2BEAJTvBZZM,3823
|
|
111
|
+
evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
112
|
+
evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
|
|
113
|
+
evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=tiy8Cn1ZmNKjVg8lqNAxWBbsKp8h0uiDNpWuHfcID0A,4689
|
|
106
114
|
evalscope/benchmarks/gsm8k/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
107
115
|
evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTvt5hpXg,4233
|
|
108
|
-
evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=
|
|
116
|
+
evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=4qtMX_SfqkXRMgGLOA6tNGMK9EkITWbjLlJT9gWbT20,10664
|
|
109
117
|
evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
110
118
|
evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
|
|
111
|
-
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=
|
|
119
|
+
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=qArX2umdrYJZkDA9i3XGBGljCton99v5Yss9be9iZYw,6269
|
|
112
120
|
evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
113
121
|
evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
|
|
114
|
-
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=
|
|
122
|
+
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=onacZB_6SF9239Ly-U70__WYsinS9iWpnf3oiYMNxKc,5164
|
|
115
123
|
evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
116
|
-
evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=
|
|
117
|
-
evalscope/benchmarks/ifeval/instructions.py,sha256=
|
|
124
|
+
evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=3HsAdNj5JJGCFA17sPXi-59yv-pfcB0UeXKdY_mQcwU,2015
|
|
125
|
+
evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
|
|
118
126
|
evalscope/benchmarks/ifeval/instructions_registry.py,sha256=tVUmhuSwnOidLtI8onOAw_gpJ6bi8FL07GiX19hSuo8,7288
|
|
119
127
|
evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2OF-oH_NcSZtzyZDURW4,25780
|
|
120
128
|
evalscope/benchmarks/ifeval/utils.py,sha256=TKrM1m2qDCUauahogItDdICf4mDk0OjasSxgnxjt2KY,4517
|
|
121
129
|
evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
122
|
-
evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=
|
|
130
|
+
evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=nv4mzKOPp1YPcr6e7daZuZyQ3jRNNG6PUzi38REuwSk,2356
|
|
131
|
+
evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
132
|
+
evalscope/benchmarks/math_500/math_500_adapter.py,sha256=mBzsllop5sTHw-uK04FjhEWDiEDjDaNUFDUBIVN7Xgg,1742
|
|
123
133
|
evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
124
134
|
evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
|
|
125
|
-
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256
|
|
135
|
+
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=pmT1y9dbWJcZK3U6hkXa3-lBDABx7DhQ7oHc3O-Nkg0,11769
|
|
126
136
|
evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
|
|
127
137
|
evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
128
|
-
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=
|
|
138
|
+
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=Fdrj26MfYmPzio2tI23WTcofrwD69_m41mkVpvlxzVU,4815
|
|
129
139
|
evalscope/benchmarks/race/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
130
140
|
evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO61nU,3608
|
|
131
|
-
evalscope/benchmarks/race/race_adapter.py,sha256=
|
|
141
|
+
evalscope/benchmarks/race/race_adapter.py,sha256=dC9I-3T9UFh2OVpmWKRmSszPOlFZAZ40xOPa4zN3daI,6661
|
|
132
142
|
evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
|
|
133
143
|
evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
134
144
|
evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
|
|
135
145
|
evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
|
|
136
|
-
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=
|
|
146
|
+
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=GVuJT-Xz4ugVtcUSTRxcBgViHVowcqJf3yVsotcZoZI,5062
|
|
137
147
|
evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
138
148
|
evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
|
|
139
|
-
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=
|
|
149
|
+
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=6rT1zuQh0nLuYymcchO-cMP98EY0vWizbfTfnUERWgo,12905
|
|
140
150
|
evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
141
151
|
evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
|
|
142
152
|
evalscope/cli/cli.py,sha256=w_dtXljur9s5lmTn6LbbFL_viTPQB1WAEzhYcId09Og,729
|
|
143
|
-
evalscope/cli/start_app.py,sha256=
|
|
153
|
+
evalscope/cli/start_app.py,sha256=_NTmCd15tZOROAnPacGWirMS4OXHrL3n2eZj1kokpks,758
|
|
144
154
|
evalscope/cli/start_eval.py,sha256=2lyD2WSQ0DnP6T31VvTimQ-6POnwxeEP9GLPFnT7Tfo,767
|
|
145
155
|
evalscope/cli/start_perf.py,sha256=lEHJBSpzNsO4KGlWfQc-EfZGXq1M_FpOwtRxRdb4fso,813
|
|
146
156
|
evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
|
|
147
157
|
evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
|
|
148
|
-
evalscope/collections/evaluator.py,sha256=
|
|
149
|
-
evalscope/collections/sampler.py,sha256=
|
|
150
|
-
evalscope/collections/schema.py,sha256=
|
|
158
|
+
evalscope/collections/evaluator.py,sha256=FJx3KGdLi0-TIqWC_067HEmA4P298BKdwHIrbcai46M,12065
|
|
159
|
+
evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
|
|
160
|
+
evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
|
|
151
161
|
evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
|
|
152
|
-
evalscope/evaluator/evaluator.py,sha256=
|
|
162
|
+
evalscope/evaluator/evaluator.py,sha256=E0NiP5O56WbF8eiUmw9IY2ouotRog9H-2SRyTzZld0I,17569
|
|
153
163
|
evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
|
|
154
164
|
evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
155
165
|
evalscope/evaluator/reviewer/auto_reviewer.py,sha256=PDz1Nt_qq0oGZBBwek2-M8VBUNLkOkmYObzR8gO7nEc,16624
|
|
156
166
|
evalscope/metrics/__init__.py,sha256=yzuZjXufrPqVhzNTNaJLJwhs7-Sgb-iNG0I3BdOX7Tg,291
|
|
157
167
|
evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
|
|
158
|
-
evalscope/metrics/
|
|
159
|
-
evalscope/metrics/metrics.py,sha256=
|
|
160
|
-
evalscope/metrics/named_metrics.py,sha256=
|
|
168
|
+
evalscope/metrics/math_parser.py,sha256=uTDudn305G3b8-GboWTrDE6OfrEwAW-areHnoGXZ6Is,17302
|
|
169
|
+
evalscope/metrics/metrics.py,sha256=r4FHyEvvFhMu0vAHBw-ByFefObDBC3DQdr53klSk6Wk,13325
|
|
170
|
+
evalscope/metrics/named_metrics.py,sha256=SeBXmgWyK4y4tKiGKro3k-CZU1OShuKe6qxwpT3tizY,1313
|
|
161
171
|
evalscope/metrics/rouge_metric.py,sha256=zhIUqenSuxnORR9tamLQBGjFwP91Zei2UiLtcOyseVM,4639
|
|
162
172
|
evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
|
|
163
173
|
evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=Kq6AObenmLVQ5tN3NgN042a6mgRFQmRO21-ohd9mSa8,11972
|
|
@@ -165,12 +175,12 @@ evalscope/metrics/resources/gpt2-zhcn3-v4.bpe,sha256=J_K-oSt9usFEw87Av6j7ETn3J48
|
|
|
165
175
|
evalscope/metrics/resources/gpt2-zhcn3-v4.json,sha256=WkM4J_FDPPNQwYi0kj5sM5SVjk2_6bci7tqf8dV9p_U,1289341
|
|
166
176
|
evalscope/models/__init__.py,sha256=pafIEbJq_2DrYjQbgI0SNVxywNYOxvqwk7Dr1P7KEwk,923
|
|
167
177
|
evalscope/models/base_adapter.py,sha256=fT3i8c9jRmz_VBcUYMMmXrlCM6JWcixPdgak5yT6Wkw,2177
|
|
168
|
-
evalscope/models/chat_adapter.py,sha256=
|
|
169
|
-
evalscope/models/choice_adapter.py,sha256=
|
|
170
|
-
evalscope/models/custom_adapter.py,sha256=
|
|
171
|
-
evalscope/models/local_model.py,sha256=
|
|
178
|
+
evalscope/models/chat_adapter.py,sha256=nOrNDuvuNKkTcW9zNcR_EIqbzkqK5PFws-5YsSxBR9E,6120
|
|
179
|
+
evalscope/models/choice_adapter.py,sha256=jj_6KB1BAsvv4Yufn2bM2tCiLovFUum2368lseogmb8,8036
|
|
180
|
+
evalscope/models/custom_adapter.py,sha256=Ed_MGEcZxKK4mkXTpUY4GXTsayprHzIEOC1L9gqwjf4,2284
|
|
181
|
+
evalscope/models/local_model.py,sha256=s0YVX9Djqazusk7qzSpWQB76jGGuzJxqQlZzomsCFsk,2621
|
|
172
182
|
evalscope/models/model.py,sha256=diu4TE1ZFWdynTxsl4DejTNsLdwjxoyj2nsKR-Y8EZE,7343
|
|
173
|
-
evalscope/models/server_adapter.py,sha256=
|
|
183
|
+
evalscope/models/server_adapter.py,sha256=iVJuUJlHGVGxnlrDMnbHZ8WQ4OR2HK5HrXH4obD2_cg,4173
|
|
174
184
|
evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk77ksNQ,102
|
|
175
185
|
evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
|
|
176
186
|
evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
|
|
@@ -222,7 +232,7 @@ evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHM
|
|
|
222
232
|
evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
|
|
223
233
|
evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
|
|
224
234
|
evalscope/report/__init__.py,sha256=0Wes3ot2hy9s-WwZaBztst8qkNrXkOF-Hwa1WW1e8lY,260
|
|
225
|
-
evalscope/report/app.py,sha256=
|
|
235
|
+
evalscope/report/app.py,sha256=adP1rVVOxYMbCTdopV3FKWBhUzB7t1AXcDOxW4Ct56g,26647
|
|
226
236
|
evalscope/report/combinator.py,sha256=bi6nvTbMrzraZ8kUZ6mIMikk8-qEIVYUhdaH4RE1Tg8,2653
|
|
227
237
|
evalscope/report/generator.py,sha256=2DULY9W8QCUxdtyfNjo8XAP_YxI1LgR95jknK__kYPU,3600
|
|
228
238
|
evalscope/report/utils.py,sha256=DRlbjbqHEmM8rGlA4pwtlHFhOZtyUzcqiS-mejfIDkU,4584
|
|
@@ -255,9 +265,9 @@ evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1
|
|
|
255
265
|
evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=usmVelh0ogBlCtSUL0dqp89w2mAqH1Ptv9MURVoGrc8,1209
|
|
256
266
|
evalscope/utils/__init__.py,sha256=jLVoGryuqUh4Km9QWWQBzpqkcVNRK0MbwNaSgckqdiU,139
|
|
257
267
|
evalscope/utils/arena_utils.py,sha256=Gf8VpH4C_oF2Abif_QeL0rAP6tvTzsc0gglpdNkUE48,7155
|
|
258
|
-
evalscope/utils/chat_service.py,sha256=
|
|
268
|
+
evalscope/utils/chat_service.py,sha256=eZ8uyVeVFpXZo_uvRFyVhnFyJpL14zcn9UA6K4Ax5J4,8676
|
|
259
269
|
evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
|
|
260
|
-
evalscope/utils/io_utils.py,sha256=
|
|
270
|
+
evalscope/utils/io_utils.py,sha256=Tjdgen1FsAA4ArqiUzu734L0Px5NuiS0GKRRiGIzxSA,4192
|
|
261
271
|
evalscope/utils/logger.py,sha256=49F2WDi1g_o8aW8Z29wOt9YHE9LDqkHIgb-d8TVybJY,3635
|
|
262
272
|
evalscope/utils/model_utils.py,sha256=PK7pKNY8ovtGZHNRvDpZ-d8zBHMOkxd6fRVkM8VF06I,736
|
|
263
273
|
evalscope/utils/utils.py,sha256=a6a2vDDxqlj7nY8xynkKkWs_ZPXEU2UMwvxp0JEpHjg,9686
|
|
@@ -265,7 +275,7 @@ tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
|
265
275
|
tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
|
|
266
276
|
tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
267
277
|
tests/cli/test_collection.py,sha256=gx3GySIAPNaLUSf3D3Q3V0WZc21BPdNthIbECHQN0TI,3026
|
|
268
|
-
tests/cli/test_run.py,sha256=
|
|
278
|
+
tests/cli/test_run.py,sha256=gtId2SF1LlDCIn4S_WKRpAyTig_pWOhY8yto4P5B1EY,8303
|
|
269
279
|
tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
270
280
|
tests/perf/test_perf.py,sha256=iB8Mg565SfwPsObdAByHYfZNqN71kUtPW7ucmyiOWo8,3025
|
|
271
281
|
tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -278,9 +288,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
|
|
|
278
288
|
tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
|
|
279
289
|
tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
280
290
|
tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
|
|
281
|
-
evalscope-0.
|
|
282
|
-
evalscope-0.
|
|
283
|
-
evalscope-0.
|
|
284
|
-
evalscope-0.
|
|
285
|
-
evalscope-0.
|
|
286
|
-
evalscope-0.
|
|
291
|
+
evalscope-0.11.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
|
|
292
|
+
evalscope-0.11.0.dist-info/METADATA,sha256=GL8Ybyby65DYg8jxjxzdcFYvXBhKzE7eRFIBRiJ0-hc,29584
|
|
293
|
+
evalscope-0.11.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
294
|
+
evalscope-0.11.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
|
|
295
|
+
evalscope-0.11.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
|
|
296
|
+
evalscope-0.11.0.dist-info/RECORD,,
|
tests/cli/test_run.py
CHANGED
|
@@ -73,16 +73,18 @@ class TestRun(unittest.TestCase):
|
|
|
73
73
|
def test_run_task(self):
|
|
74
74
|
task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct',
|
|
75
75
|
'datasets': [
|
|
76
|
-
'mmlu_pro',
|
|
76
|
+
# 'mmlu_pro',
|
|
77
77
|
# 'bbh',
|
|
78
|
-
'hellaswag',
|
|
78
|
+
# 'hellaswag',
|
|
79
79
|
# 'gsm8k',
|
|
80
|
-
# 'arc'
|
|
80
|
+
# 'arc',
|
|
81
81
|
# 'race',
|
|
82
|
+
'ifeval',
|
|
82
83
|
# 'truthful_qa',
|
|
83
84
|
# 'trivia_qa',
|
|
84
85
|
],
|
|
85
|
-
'limit':
|
|
86
|
+
'limit': 2,
|
|
87
|
+
'eval_batch_size': 2,
|
|
86
88
|
'debug': True}
|
|
87
89
|
run_task(task_cfg=task_cfg)
|
|
88
90
|
|
|
@@ -93,9 +95,9 @@ class TestRun(unittest.TestCase):
|
|
|
93
95
|
|
|
94
96
|
task_cfg = TaskConfig(
|
|
95
97
|
model='qwen/Qwen2-0.5B-Instruct',
|
|
96
|
-
datasets=['
|
|
98
|
+
datasets=['general_mcq', 'general_qa'], # 数据格式,选择题格式固定为 'ceval'
|
|
97
99
|
dataset_args={
|
|
98
|
-
'
|
|
100
|
+
'general_mcq': {
|
|
99
101
|
'local_path': 'custom_eval/text/mcq', # 自定义数据集路径
|
|
100
102
|
'subset_list': [
|
|
101
103
|
'example' # 评测数据集名称,上述 *_dev.csv 中的 *
|
|
@@ -117,8 +119,17 @@ class TestRun(unittest.TestCase):
|
|
|
117
119
|
|
|
118
120
|
task_cfg = TaskConfig(
|
|
119
121
|
model='qwen/Qwen2-0.5B-Instruct',
|
|
120
|
-
datasets=[
|
|
121
|
-
|
|
122
|
+
datasets=[
|
|
123
|
+
# 'math_500',
|
|
124
|
+
# 'aime24',
|
|
125
|
+
'competition_math'
|
|
126
|
+
],
|
|
127
|
+
dataset_args={
|
|
128
|
+
'competition_math': {
|
|
129
|
+
'subset_list': ['Level 4', 'Level 5']
|
|
130
|
+
}
|
|
131
|
+
},
|
|
132
|
+
limit=5
|
|
122
133
|
)
|
|
123
134
|
|
|
124
135
|
run_task(task_cfg=task_cfg)
|
|
@@ -128,12 +139,12 @@ class TestRun(unittest.TestCase):
|
|
|
128
139
|
from evalscope.config import TaskConfig
|
|
129
140
|
|
|
130
141
|
task_cfg = TaskConfig(
|
|
131
|
-
model='Qwen2.5-
|
|
142
|
+
model='Qwen2.5-0.5B-Instruct',
|
|
132
143
|
api_url='http://127.0.0.1:8801/v1/chat/completions',
|
|
133
144
|
api_key='EMPTY',
|
|
134
145
|
eval_type=EvalType.SERVICE,
|
|
135
146
|
datasets=[
|
|
136
|
-
'iquiz',
|
|
147
|
+
# 'iquiz',
|
|
137
148
|
# 'ifeval',
|
|
138
149
|
# 'mmlu',
|
|
139
150
|
# 'mmlu_pro',
|
|
@@ -141,25 +152,91 @@ class TestRun(unittest.TestCase):
|
|
|
141
152
|
# 'trivia_qa',
|
|
142
153
|
# 'cmmlu',
|
|
143
154
|
# 'humaneval',
|
|
144
|
-
# 'competition_math',
|
|
145
155
|
# 'gsm8k',
|
|
156
|
+
# 'bbh',
|
|
157
|
+
'competition_math',
|
|
158
|
+
'math_500',
|
|
159
|
+
'aime24',
|
|
160
|
+
'gpqa',
|
|
146
161
|
# 'arc',
|
|
147
162
|
# 'ceval',
|
|
148
|
-
# 'bbh',
|
|
149
163
|
# 'hellaswag',
|
|
150
164
|
],
|
|
151
165
|
dataset_args={
|
|
166
|
+
'mmlu': {
|
|
167
|
+
'subset_list': ['elementary_mathematics'],
|
|
168
|
+
'few_shot_num': 0
|
|
169
|
+
},
|
|
170
|
+
'mmlu_pro': {
|
|
171
|
+
'subset_list': ['math'],
|
|
172
|
+
'few_shot_num': 0
|
|
173
|
+
},
|
|
152
174
|
'ceval': {
|
|
153
175
|
'subset_list': [
|
|
154
|
-
'computer_network', 'operating_system', 'computer_architecture'
|
|
155
|
-
]
|
|
156
|
-
|
|
176
|
+
'computer_network', 'operating_system', 'computer_architecture'
|
|
177
|
+
],
|
|
178
|
+
'few_shot_num': 0
|
|
179
|
+
},
|
|
180
|
+
'cmmlu': {
|
|
181
|
+
'subset_list': ['elementary_chinese'],
|
|
182
|
+
'few_shot_num': 0
|
|
183
|
+
},
|
|
184
|
+
'bbh': {
|
|
185
|
+
'subset_list': ['word_sorting', 'movie_recommendation'],
|
|
186
|
+
},
|
|
187
|
+
'gpqa': {
|
|
188
|
+
'subset_list': ['gpqa_diamond'],
|
|
189
|
+
'few_shot_num': 0
|
|
190
|
+
},
|
|
191
|
+
'humaneval': {
|
|
192
|
+
'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
|
|
193
|
+
},
|
|
194
|
+
'competition_math': {
|
|
195
|
+
'subset_list': ['Level 1']
|
|
196
|
+
},
|
|
197
|
+
},
|
|
198
|
+
eval_batch_size=5,
|
|
199
|
+
limit=10,
|
|
200
|
+
debug=True,
|
|
201
|
+
generation_config={
|
|
202
|
+
'temperature': 0.7,
|
|
203
|
+
'n': 5
|
|
157
204
|
},
|
|
158
|
-
|
|
205
|
+
use_cache='/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250212_150525'
|
|
159
206
|
)
|
|
160
207
|
|
|
161
208
|
run_task(task_cfg=task_cfg)
|
|
162
209
|
|
|
163
210
|
|
|
211
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
212
|
+
def test_run_batch_eval(self):
|
|
213
|
+
from evalscope.config import TaskConfig
|
|
214
|
+
|
|
215
|
+
task_cfg = TaskConfig(
|
|
216
|
+
model='LLM-Research/Llama-3.2-1B-Instruct',
|
|
217
|
+
datasets=[
|
|
218
|
+
# 'math_500',
|
|
219
|
+
# 'aime24',
|
|
220
|
+
# 'competition_math'
|
|
221
|
+
# 'arc',
|
|
222
|
+
'gsm8k'
|
|
223
|
+
# 'truthful_qa'
|
|
224
|
+
],
|
|
225
|
+
dataset_args={
|
|
226
|
+
'competition_math': {
|
|
227
|
+
'subset_list': ['Level 4', 'Level 5']
|
|
228
|
+
}
|
|
229
|
+
},
|
|
230
|
+
eval_batch_size=2,
|
|
231
|
+
limit=5,
|
|
232
|
+
generation_config={
|
|
233
|
+
'max_new_tokens': 2048,
|
|
234
|
+
'temperature': 0.7,
|
|
235
|
+
'num_return_sequences': 2,
|
|
236
|
+
}
|
|
237
|
+
)
|
|
238
|
+
|
|
239
|
+
run_task(task_cfg=task_cfg)
|
|
240
|
+
|
|
164
241
|
if __name__ == '__main__':
|
|
165
242
|
unittest.main()
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{'id': 0, 'question': '下列关于税法基本原则的表述中,不正确的是____。', 'A': '税收法定原则包括税收要件法定原则和税务合法性原则', 'B': '税收公平原则源于法律上的平等性原则', 'C': '税收效率原则包含经济效率和行政效率两个方面', 'D': '税务机关按法定程序依法征税,可以自由做出减征、停征或免征税款的决定', 'answer': 'D', 'explanation': ''}
|