evalscope 0.12.0__py3-none-any.whl → 0.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/arguments.py +1 -1
- evalscope/benchmarks/aime/aime24_adapter.py +3 -3
- evalscope/benchmarks/aime/aime25_adapter.py +3 -3
- evalscope/benchmarks/arc/arc_adapter.py +14 -17
- evalscope/benchmarks/bbh/bbh_adapter.py +6 -6
- evalscope/benchmarks/benchmark.py +9 -9
- evalscope/benchmarks/ceval/ceval_adapter.py +10 -15
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +11 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -3
- evalscope/benchmarks/data_adapter.py +31 -21
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +9 -12
- evalscope/benchmarks/general_qa/general_qa_adapter.py +25 -11
- evalscope/benchmarks/gpqa/gpqa_adapter.py +12 -7
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -3
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +8 -12
- evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -2
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -3
- evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -6
- evalscope/benchmarks/mmlu/mmlu_adapter.py +11 -16
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +9 -5
- evalscope/benchmarks/musr/musr_adapter.py +8 -5
- evalscope/benchmarks/process_bench/process_bench_adapter.py +8 -5
- evalscope/benchmarks/race/race_adapter.py +12 -16
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +20 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
- evalscope/benchmarks/super_gpqa/utils.py +90 -0
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +6 -13
- evalscope/benchmarks/utils.py +43 -0
- evalscope/collections/evaluator.py +11 -2
- evalscope/config.py +10 -2
- evalscope/constants.py +7 -0
- evalscope/metrics/named_metrics.py +1 -0
- evalscope/models/__init__.py +2 -1
- evalscope/models/base_adapter.py +25 -5
- evalscope/models/chat_adapter.py +3 -0
- evalscope/models/choice_adapter.py +4 -0
- evalscope/models/custom_adapter.py +2 -0
- evalscope/models/register.py +28 -0
- evalscope/models/server_adapter.py +35 -8
- evalscope/perf/arguments.py +13 -7
- evalscope/perf/http_client.py +6 -4
- evalscope/perf/utils/analysis_result.py +1 -1
- evalscope/report/app.py +3 -0
- evalscope/report/combinator.py +2 -2
- evalscope/run.py +5 -4
- evalscope/third_party/thinkbench/eval.py +220 -55
- evalscope/third_party/thinkbench/infer.py +37 -7
- evalscope/third_party/thinkbench/tools/llm.py +1 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
- evalscope/utils/chat_service.py +1 -0
- evalscope/utils/filters.py +59 -0
- evalscope/utils/logger.py +3 -3
- evalscope/version.py +2 -2
- {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/METADATA +7 -3
- {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/RECORD +68 -58
- tests/cli/test_collection.py +1 -1
- tests/cli/test_run.py +135 -28
- {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/LICENSE +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/WHEEL +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/top_level.txt +0 -0
evalscope/utils/logger.py
CHANGED
|
@@ -12,12 +12,12 @@ detailed_formatter = logging.Formatter(detailed_format)
|
|
|
12
12
|
simple_formatter = logging.Formatter(simple_format)
|
|
13
13
|
DEFAULT_LEVEL = logging.DEBUG if os.getenv('LOG_LEVEL', 'INFO') == 'DEBUG' else logging.INFO
|
|
14
14
|
|
|
15
|
-
logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL)
|
|
15
|
+
logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL, force=True)
|
|
16
16
|
|
|
17
|
-
#
|
|
17
|
+
# set logging level
|
|
18
18
|
logging.getLogger('datasets').setLevel(logging.WARNING)
|
|
19
|
-
logging.getLogger('modelscope').setLevel(logging.WARNING)
|
|
20
19
|
logging.getLogger('httpx').setLevel(logging.WARNING)
|
|
20
|
+
logging.getLogger('modelscope').setLevel(logging.ERROR)
|
|
21
21
|
|
|
22
22
|
|
|
23
23
|
def get_logger(log_file: Optional[str] = None, log_level: int = DEFAULT_LEVEL, file_mode: str = 'w', force=False):
|
{evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/METADATA
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: evalscope
|
|
3
|
-
Version: 0.12.0
|
|
3
|
+
Version: 0.12.1
|
|
4
4
|
Summary: EvalScope: Lightweight LLMs Evaluation Framework
|
|
5
5
|
Home-page: https://github.com/modelscope/evalscope
|
|
6
6
|
Author: ModelScope team
|
|
@@ -225,10 +225,14 @@ Please scan the QR code below to join our community groups:
|
|
|
225
225
|
|
|
226
226
|
|
|
227
227
|
## 🎉 News
|
|
228
|
-
|
|
228
|
+
|
|
229
|
+
- 🔥 **[2025.03.07]** Added support for the [QwQ-32B](https://modelscope.cn/models/Qwen/QwQ-32B/summary) model, evaluate the model's reasoning ability and reasoning efficiency, refer to [📖 Best Practices for QwQ-32B Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html) for more details.
|
|
230
|
+
- 🔥 **[2025.03.04]** Added support for the [SuperGPQA](https://modelscope.cn/datasets/m-a-p/SuperGPQA/summary) dataset, which covers 13 categories, 72 first-level disciplines, and 285 second-level disciplines, totaling 26,529 questions. You can use it by specifying `super_gpqa`.
|
|
231
|
+
- 🔥 **[2025.03.03]** Added support for evaluating the IQ and EQ of models. Refer to [📖 Best Practices for IQ and EQ Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/iquiz.html) to find out how smart your AI is!
|
|
232
|
+
- 🔥 **[2025.02.27]** Added support for evaluating the reasoning efficiency of models. Refer to [📖 Best Practices for Evaluating Thinking Efficiency](https://evalscope.readthedocs.io/en/latest/best_practice/think_eval.html). This implementation is inspired by the works [Overthinking](https://doi.org/10.48550/arXiv.2412.21187) and [Underthinking](https://doi.org/10.48550/arXiv.2501.18585).
|
|
229
233
|
- 🔥 **[2025.02.25]** Added support for two model inference-related evaluation benchmarks: [MuSR](https://modelscope.cn/datasets/AI-ModelScope/MuSR) and [ProcessBench](https://www.modelscope.cn/datasets/Qwen/ProcessBench/summary). To use them, simply specify `musr` and `process_bench` respectively in the datasets parameter.
|
|
230
234
|
- 🔥 **[2025.02.18]** Supports the AIME25 dataset, which contains 15 questions (Grok3 scored 93 on this dataset).
|
|
231
|
-
- 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/
|
|
235
|
+
- 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
|
|
232
236
|
- 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
|
|
233
237
|
- 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
|
|
234
238
|
- 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
|
|
{evalscope-0.12.0.dist-info → evalscope-0.12.1.dist-info}/RECORD
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
|
|
2
|
-
evalscope/arguments.py,sha256=
|
|
3
|
-
evalscope/config.py,sha256=
|
|
4
|
-
evalscope/constants.py,sha256=
|
|
5
|
-
evalscope/run.py,sha256=
|
|
2
|
+
evalscope/arguments.py,sha256=QT3f_oBDl1jXl68rgHVBsOxWeJTw1zXFmm7Zu1VRMQU,4826
|
|
3
|
+
evalscope/config.py,sha256=eQ_r94W_uQiF9ZWN-k84KxrT85E3YiJklDuM5mIKt_s,9124
|
|
4
|
+
evalscope/constants.py,sha256=l6xkVknVybi3frXaftksRZNaCFcw9ZJZ8ORJeWDJEaQ,3615
|
|
5
|
+
evalscope/run.py,sha256=ae6WsKllRt5xanRRFJWSBkVEjCf-Lgx35nlLyqOxctU,5785
|
|
6
6
|
evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
|
|
7
7
|
evalscope/summarizer.py,sha256=Wnt8Y61PasOkigo8zMpi1qdFzfETjfnDtCDDQ6VwgDw,5867
|
|
8
|
-
evalscope/version.py,sha256=
|
|
8
|
+
evalscope/version.py,sha256=KVyRitFqvCQM-1iaU2VOfx7rh9IDqOUGstYhQ6DLAI4,119
|
|
9
9
|
evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
|
|
11
11
|
evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
|
|
@@ -56,16 +56,17 @@ evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4ty
|
|
|
56
56
|
evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
|
|
57
57
|
evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
|
|
58
58
|
evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
|
|
59
|
-
evalscope/benchmarks/benchmark.py,sha256=
|
|
60
|
-
evalscope/benchmarks/data_adapter.py,sha256=
|
|
59
|
+
evalscope/benchmarks/benchmark.py,sha256=AByXFsuia3lqCLFsPRt95UR7SxwEuAGpeuKBVjb7jLE,2463
|
|
60
|
+
evalscope/benchmarks/data_adapter.py,sha256=JwptQHL4DbcZ_Ll0kJ0QL8rgK2ZVFftyAXiUWKcrvL4,15532
|
|
61
|
+
evalscope/benchmarks/utils.py,sha256=6kxeBz4w8Fw68AYH05a4ncjgkaUV4bU3eaFVLqOdkMI,1321
|
|
61
62
|
evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
62
|
-
evalscope/benchmarks/aime/aime24_adapter.py,sha256=
|
|
63
|
-
evalscope/benchmarks/aime/aime25_adapter.py,sha256=
|
|
63
|
+
evalscope/benchmarks/aime/aime24_adapter.py,sha256=dBm9yukt4-CByEPUlAPAIN6mL3VkZcI-dw2kz4oQBMo,1715
|
|
64
|
+
evalscope/benchmarks/aime/aime25_adapter.py,sha256=FB_NufY2V7uYdxVnrY_4y81gyyfYDnvedz1_zHdDWt4,1709
|
|
64
65
|
evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
65
66
|
evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
|
|
66
|
-
evalscope/benchmarks/arc/arc_adapter.py,sha256=
|
|
67
|
+
evalscope/benchmarks/arc/arc_adapter.py,sha256=8ksPc6IM266NE7F9Bo-Y9SRZZM-tlCKPfLbJg3VEq9w,6269
|
|
67
68
|
evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
68
|
-
evalscope/benchmarks/bbh/bbh_adapter.py,sha256=
|
|
69
|
+
evalscope/benchmarks/bbh/bbh_adapter.py,sha256=fROpzenrjpEBWtnvM_RL_m0uXPOhXTtYAglJEZbzUdY,8330
|
|
69
70
|
evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
|
|
70
71
|
evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt,sha256=sfo-2iOeVzB0OGgd7NSQFELTGDTsr2DQ3u-g0ivI-sM,3653
|
|
71
72
|
evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt,sha256=UJBsc3Mwz8TZngdWH_NFlhhNbLhNHK6FvW9FHcS8H5g,1167
|
|
@@ -94,65 +95,72 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt
|
|
|
94
95
|
evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
|
|
95
96
|
evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
|
|
96
97
|
evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
97
|
-
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=
|
|
98
|
+
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=B3nO0WmqSyH-LlicqreIPWrxXgVPt1rrp3ndc7YRYiE,11157
|
|
98
99
|
evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
|
|
99
100
|
evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
100
101
|
evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
|
|
101
|
-
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=
|
|
102
|
+
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=zNaYSelcGZulgFLQXp2eD56_QOFRkaXHknfy_VWJciA,10230
|
|
102
103
|
evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
|
|
103
104
|
evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
104
105
|
evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
|
|
105
|
-
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=
|
|
106
|
+
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=F2YCaNDn49X82l06WlLFp2OPFB7nv0ecW40099I9iSE,6871
|
|
106
107
|
evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
107
|
-
evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=
|
|
108
|
+
evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=U4M-0MVJS3-z03YW8nafooFJ7x60e5uEpBO5z_c7zk8,2450
|
|
108
109
|
evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
109
|
-
evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=
|
|
110
|
+
evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=ZVGzUuuQ0UTOqQtXE40ZyBeMOSl8saSiFEQ5_siJ-c8,5052
|
|
110
111
|
evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
111
|
-
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=
|
|
112
|
+
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=wnKUIVc1UvnjI5XGOHf5aCx0H0xTKoZZWAD-Q8AJNAE,4686
|
|
112
113
|
evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
113
114
|
evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
|
|
114
|
-
evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=
|
|
115
|
+
evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=UB287DtnbkSQXZsbReFJqmQRwbo672DTCeXXilR_-Vc,4790
|
|
115
116
|
evalscope/benchmarks/gsm8k/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
116
117
|
evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTvt5hpXg,4233
|
|
117
|
-
evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=
|
|
118
|
+
evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=ZZZ-9oja53IwiU33Kjm7NTk4MbFGWyvonhnHrn_3Na8,10557
|
|
118
119
|
evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
119
120
|
evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
|
|
120
|
-
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=
|
|
121
|
+
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=2CnrIapK51l4bQyFKWWqmOaeBSpkIlq2asetWcp24gs,6057
|
|
121
122
|
evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
122
123
|
evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
|
|
123
|
-
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=
|
|
124
|
+
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=YK4u3JG_Ub4vP-xnsrf-lMheIBdCgFWmirhPUch3biU,5120
|
|
124
125
|
evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
125
|
-
evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=
|
|
126
|
+
evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=R7MILWuMglvXr7yWioBxyJ2T4EdEkwRZ1lnvWqZqG28,1922
|
|
126
127
|
evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
|
|
127
128
|
evalscope/benchmarks/ifeval/instructions_registry.py,sha256=tVUmhuSwnOidLtI8onOAw_gpJ6bi8FL07GiX19hSuo8,7288
|
|
128
129
|
evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2OF-oH_NcSZtzyZDURW4,25780
|
|
129
130
|
evalscope/benchmarks/ifeval/utils.py,sha256=TKrM1m2qDCUauahogItDdICf4mDk0OjasSxgnxjt2KY,4517
|
|
130
131
|
evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
131
|
-
evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=
|
|
132
|
+
evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=16whmFkJt9fLbei9d-kmjnWB_5y5vsiX9tK5kSuxDw8,2449
|
|
132
133
|
evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
133
|
-
evalscope/benchmarks/math_500/math_500_adapter.py,sha256=
|
|
134
|
+
evalscope/benchmarks/math_500/math_500_adapter.py,sha256=SB2eb4Z7DTXdptqirEoctqTdDLEu28s7bLeCAMBmAFo,1923
|
|
134
135
|
evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
135
136
|
evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
|
|
136
|
-
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=
|
|
137
|
+
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=RMZoHAApVOpD3_NeHLcsiM7SpglKpfrGSUhBWPgdAVE,11525
|
|
137
138
|
evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
|
|
138
139
|
evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
139
|
-
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=
|
|
140
|
+
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=OANfue-fK543drJrDj6V_zDMtySrQEBHPgTsejr-e7U,4226
|
|
140
141
|
evalscope/benchmarks/musr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
141
|
-
evalscope/benchmarks/musr/musr_adapter.py,sha256=
|
|
142
|
+
evalscope/benchmarks/musr/musr_adapter.py,sha256=Po8hcIQiqlFo0AGjcNQe75cpsMNDcfiJaKgZsk33-DY,2442
|
|
142
143
|
evalscope/benchmarks/process_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
143
144
|
evalscope/benchmarks/process_bench/critique_template.txt,sha256=tycx8n42QEC0uGcwbIvHfZvfTnchlRxGz8Tp1R2_e_Y,489
|
|
144
|
-
evalscope/benchmarks/process_bench/process_bench_adapter.py,sha256=
|
|
145
|
+
evalscope/benchmarks/process_bench/process_bench_adapter.py,sha256=ydU-r1T0DaYhOxkhZgGL7PhDd4XoeqOBzVO9oiFPd8M,3422
|
|
145
146
|
evalscope/benchmarks/race/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
146
147
|
evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO61nU,3608
|
|
147
|
-
evalscope/benchmarks/race/race_adapter.py,sha256=
|
|
148
|
+
evalscope/benchmarks/race/race_adapter.py,sha256=RD0B-i5dzeNKuhqnWbremgf4tk9jmOO4_eLAiITB1F0,6381
|
|
148
149
|
evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
|
|
150
|
+
evalscope/benchmarks/simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
151
|
+
evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=FZwXN78X2fV3Dchop_UuFAhNFkwWs12qJlIczgvvrJ8,477
|
|
152
|
+
evalscope/benchmarks/super_gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
153
|
+
evalscope/benchmarks/super_gpqa/five_shot_prompt.txt,sha256=vD3RMeQustxY_oWA8IobntjywT8ZUO7Jaub--rElDT4,4718
|
|
154
|
+
evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=BqNLL8BYnK6tRuIdV6ijL4Uym2SejH_h1BV06XNjSE4,9331
|
|
155
|
+
evalscope/benchmarks/super_gpqa/utils.py,sha256=uhANVnoIaH8-QuzjcVuyVB-8aGOMy94XKUF-TFemY_Q,3578
|
|
156
|
+
evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt,sha256=y7hR9SmoR_YqoEWtT8N9JpZOpeJIlg0cDGDgYw6R6hM,237
|
|
149
157
|
evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
150
158
|
evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
|
|
151
159
|
evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
|
|
152
|
-
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=
|
|
160
|
+
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=7tMc8vVZdBnks5jWrBSrb5BSyjO2eD4On6gX8xqlkV8,4961
|
|
153
161
|
evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
154
162
|
evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
|
|
155
|
-
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=
|
|
163
|
+
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=ueUU860kg5_xf_MtUCa6ck-fGHX3ttw8Xh3mWSJyOZA,12617
|
|
156
164
|
evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
157
165
|
evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
|
|
158
166
|
evalscope/cli/cli.py,sha256=w_dtXljur9s5lmTn6LbbFL_viTPQB1WAEzhYcId09Og,729
|
|
@@ -161,7 +169,7 @@ evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,7
|
|
|
161
169
|
evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
|
|
162
170
|
evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
|
|
163
171
|
evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
|
|
164
|
-
evalscope/collections/evaluator.py,sha256=
|
|
172
|
+
evalscope/collections/evaluator.py,sha256=Zi3uRZhSRIimYye_apZWL6VOiHqaM5znbFA4TBvqSbg,12761
|
|
165
173
|
evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
|
|
166
174
|
evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
|
|
167
175
|
evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
|
|
@@ -173,27 +181,28 @@ evalscope/metrics/__init__.py,sha256=SWvqzUzdryW5URz6u4fPkP9XSyA09nQ8zBeE8BbchSg
|
|
|
173
181
|
evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
|
|
174
182
|
evalscope/metrics/math_parser.py,sha256=uTDudn305G3b8-GboWTrDE6OfrEwAW-areHnoGXZ6Is,17302
|
|
175
183
|
evalscope/metrics/metrics.py,sha256=_YI7RhxlFu_JOgeE3LF9UKu6mJruvyu4FgqVf78Bjb8,13813
|
|
176
|
-
evalscope/metrics/named_metrics.py,sha256=
|
|
184
|
+
evalscope/metrics/named_metrics.py,sha256=pSHA2_qdi9B5bDHIh08GYhx63odilSwA_T-95K1Usl0,1380
|
|
177
185
|
evalscope/metrics/rouge_metric.py,sha256=zhIUqenSuxnORR9tamLQBGjFwP91Zei2UiLtcOyseVM,4639
|
|
178
186
|
evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
|
|
179
187
|
evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=Kq6AObenmLVQ5tN3NgN042a6mgRFQmRO21-ohd9mSa8,11972
|
|
180
188
|
evalscope/metrics/resources/gpt2-zhcn3-v4.bpe,sha256=J_K-oSt9usFEw87Av6j7ETn3J48EsmFuY5_iVvY6xjc,524464
|
|
181
189
|
evalscope/metrics/resources/gpt2-zhcn3-v4.json,sha256=WkM4J_FDPPNQwYi0kj5sM5SVjk2_6bci7tqf8dV9p_U,1289341
|
|
182
|
-
evalscope/models/__init__.py,sha256=
|
|
183
|
-
evalscope/models/base_adapter.py,sha256=
|
|
184
|
-
evalscope/models/chat_adapter.py,sha256=
|
|
185
|
-
evalscope/models/choice_adapter.py,sha256=
|
|
186
|
-
evalscope/models/custom_adapter.py,sha256=
|
|
190
|
+
evalscope/models/__init__.py,sha256=i9vcOBMEF_UM7C2gpmh2GsQk3njwqevoQ6A4CnP1fHs,1000
|
|
191
|
+
evalscope/models/base_adapter.py,sha256=7PbRwfD5PIZCBYVds6ZHI8TBY9C5i2LdPOTu88FJWlY,3414
|
|
192
|
+
evalscope/models/chat_adapter.py,sha256=5-yz7L41OdeBO9J_qRkEZcduATrYIMe__UFfh7BzjIc,6277
|
|
193
|
+
evalscope/models/choice_adapter.py,sha256=fnJdo-FMJ-zvNLbEJGc73odgWXIxtVudL00JIf2vzsA,8239
|
|
194
|
+
evalscope/models/custom_adapter.py,sha256=Za52WF1I_YcJkGomJ6s9sP2Fs8DoJ4HHBYBi3iC3WNI,2379
|
|
187
195
|
evalscope/models/local_model.py,sha256=yydggBCLcBAmUWbBhv7o2CA3RbG0DwDZharPdrkbNcg,2628
|
|
188
196
|
evalscope/models/model.py,sha256=diu4TE1ZFWdynTxsl4DejTNsLdwjxoyj2nsKR-Y8EZE,7343
|
|
189
|
-
evalscope/models/
|
|
197
|
+
evalscope/models/register.py,sha256=4vX6AfScAzwD7UkncbuejfAiQHznQkK5hvtG6jEUbWo,809
|
|
198
|
+
evalscope/models/server_adapter.py,sha256=dS_o9_iC8QY73AehIekYwBQieFECZ97JRfbfleJ-Dtk,6845
|
|
190
199
|
evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk77ksNQ,102
|
|
191
200
|
evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
|
|
192
201
|
evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
|
|
193
202
|
evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
194
|
-
evalscope/perf/arguments.py,sha256=
|
|
203
|
+
evalscope/perf/arguments.py,sha256=u3GNdnOBmiEirtgJLspsLO7qBwHeWLoXd4vlt69jJ-g,9717
|
|
195
204
|
evalscope/perf/benchmark.py,sha256=qNgDNseW8N0beuAB_4-JVtTdHs7ZaJEHK5XnkMU9vRU,9618
|
|
196
|
-
evalscope/perf/http_client.py,sha256=
|
|
205
|
+
evalscope/perf/http_client.py,sha256=eoRPaBTCVC4DpgH4tnc-31_h_2PVkWUwCLWK6_TTkhM,7282
|
|
197
206
|
evalscope/perf/main.py,sha256=SUMz8S2XPL8JaSL1-vy8qkrb34d5vp6DfQdwIGOUXTk,1277
|
|
198
207
|
evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
|
|
199
208
|
evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
|
|
@@ -211,7 +220,7 @@ evalscope/perf/plugin/datasets/longalpaca.py,sha256=2aENqCly_DX1dyNcurYsLFJIvXYF
|
|
|
211
220
|
evalscope/perf/plugin/datasets/openqa.py,sha256=2pv7yyPSFYTjPhvAGBsHl0eQO8gt7Wk1CaKcfTi3Tnc,1394
|
|
212
221
|
evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=ef8MXhT6756y6LsXSpYeWjmwswu2hRXe2BOVS2_OgVM,1968
|
|
213
222
|
evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
214
|
-
evalscope/perf/utils/analysis_result.py,sha256=
|
|
223
|
+
evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
|
|
215
224
|
evalscope/perf/utils/benchmark_util.py,sha256=4TyQ_tE5odcjKDFDueI3jrC0vld6QxmTreOd5_SP4vE,5802
|
|
216
225
|
evalscope/perf/utils/db_util.py,sha256=PSBq16uWyzXx0zyoEE4wazWKN19UAA8_GjobS7rTPso,9001
|
|
217
226
|
evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
|
|
@@ -238,8 +247,8 @@ evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHM
|
|
|
238
247
|
evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
|
|
239
248
|
evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
|
|
240
249
|
evalscope/report/__init__.py,sha256=0Wes3ot2hy9s-WwZaBztst8qkNrXkOF-Hwa1WW1e8lY,260
|
|
241
|
-
evalscope/report/app.py,sha256=
|
|
242
|
-
evalscope/report/combinator.py,sha256=
|
|
250
|
+
evalscope/report/app.py,sha256=cvof2Nm4ORxC4D3L22Kg3Ngu3kJwBZlfnFJkwDMCmSQ,26881
|
|
251
|
+
evalscope/report/combinator.py,sha256=O3QirwtYhDhdaWVT4STJMCGZMwoX8BTeJ3HtS9iwnWQ,2567
|
|
243
252
|
evalscope/report/generator.py,sha256=2DULY9W8QCUxdtyfNjo8XAP_YxI1LgR95jknK__kYPU,3600
|
|
244
253
|
evalscope/report/utils.py,sha256=DRlbjbqHEmM8rGlA4pwtlHFhOZtyUzcqiS-mejfIDkU,4584
|
|
245
254
|
evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
@@ -260,12 +269,12 @@ evalscope/third_party/longbench_write/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc
|
|
|
260
269
|
evalscope/third_party/longbench_write/tools/data_etl.py,sha256=T7a-4PwZg5alZQh-oTi1zjMxjGmVVZYVwSR9-diZlF8,5971
|
|
261
270
|
evalscope/third_party/longbench_write/tools/openai_api.py,sha256=PiIvvDYJkn041SJkLoroXwl1B8TtwpB7licVfqNSeuQ,8168
|
|
262
271
|
evalscope/third_party/thinkbench/__init__.py,sha256=C0aSu71_dc1upUVkKmq2VgDd9plpRcYUdCE6BjUWJcA,110
|
|
263
|
-
evalscope/third_party/thinkbench/eval.py,sha256=
|
|
264
|
-
evalscope/third_party/thinkbench/infer.py,sha256
|
|
272
|
+
evalscope/third_party/thinkbench/eval.py,sha256=76G4LTkxqWCDCyj7Ahjj-qjO1gFem1uDzpRAC27ICl0,18896
|
|
273
|
+
evalscope/third_party/thinkbench/infer.py,sha256=2L4DAJKn3wAhNEKnKudQT60igGOJSKH80FR4nS7DHYk,3952
|
|
265
274
|
evalscope/third_party/thinkbench/resources/critique_template.txt,sha256=d4Egc-qH--4lG8X_EcmgymnuZgiCMbee1M5pt4HrRKA,535
|
|
266
275
|
evalscope/third_party/thinkbench/resources/reformat_template.txt,sha256=zTZyVAzmMBtAwI9lHly9EXsqX471OW-VTg538PDcB30,1775
|
|
267
276
|
evalscope/third_party/thinkbench/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
268
|
-
evalscope/third_party/thinkbench/tools/llm.py,sha256=
|
|
277
|
+
evalscope/third_party/thinkbench/tools/llm.py,sha256=HCFh58_THsVrFVzvGoThwWRu8EbPXD0DotLQEj5u4Tg,1353
|
|
269
278
|
evalscope/third_party/thinkbench/tools/utils.py,sha256=rDu2GVTK4ji9Yh9RLVksZqrfurQsSuN9GW3QCKJ60ng,401
|
|
270
279
|
evalscope/third_party/toolbench_static/README.md,sha256=Osdnt0_K-setbmYwDPCPRp2LXxamGp2mE8KsOByPPOY,3944
|
|
271
280
|
evalscope/third_party/toolbench_static/__init__.py,sha256=BO936RxwodHr4OEpV6W3S_keC91OfOd41_msIJ2d0fs,128
|
|
@@ -276,20 +285,21 @@ evalscope/third_party/toolbench_static/infer.py,sha256=rsADLhEd2IBcC6EI9aD7hSJmo
|
|
|
276
285
|
evalscope/third_party/toolbench_static/requirements.txt,sha256=OW91Z8hfzh7yQUYgP1Di_E6DgNgGoGP1UcvnqrdCR68,22
|
|
277
286
|
evalscope/third_party/toolbench_static/toolbench_static.py,sha256=ABb9Gy09zMt30tY50AZGxSZ46k3NVEsvuDj6xlLOjeA,1966
|
|
278
287
|
evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
279
|
-
evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=
|
|
288
|
+
evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=GITEbyiER10Zi-ZWpSqYCdAsiVtNeGK24hvR3kmYn2s,2689
|
|
280
289
|
evalscope/utils/__init__.py,sha256=jLVoGryuqUh4Km9QWWQBzpqkcVNRK0MbwNaSgckqdiU,139
|
|
281
290
|
evalscope/utils/arena_utils.py,sha256=Gf8VpH4C_oF2Abif_QeL0rAP6tvTzsc0gglpdNkUE48,7155
|
|
282
|
-
evalscope/utils/chat_service.py,sha256=
|
|
291
|
+
evalscope/utils/chat_service.py,sha256=9LNTT-8KsacOLqnQer8j57e224rwOMbU7txV6re-X-A,8720
|
|
283
292
|
evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
|
|
293
|
+
evalscope/utils/filters.py,sha256=x_NX40uWMmUsVrAGHCeeV2e63HZZFugWUgdUhk64ivM,1523
|
|
284
294
|
evalscope/utils/io_utils.py,sha256=Tjdgen1FsAA4ArqiUzu734L0Px5NuiS0GKRRiGIzxSA,4192
|
|
285
|
-
evalscope/utils/logger.py,sha256=
|
|
295
|
+
evalscope/utils/logger.py,sha256=barHSdtbEu21ynGQj_wS-rd7B02wPPR5AgaWCQzvG4w,3638
|
|
286
296
|
evalscope/utils/model_utils.py,sha256=hB9W334ecAb6553FhooT6_jM0g-tjj6AU48IV3K1CKw,1131
|
|
287
297
|
evalscope/utils/utils.py,sha256=lGvn94ryIzx-7WLNJeuyehNTmINt0jYIjrjW12woPCs,9730
|
|
288
298
|
tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
289
299
|
tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
|
|
290
300
|
tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
291
|
-
tests/cli/test_collection.py,sha256
|
|
292
|
-
tests/cli/test_run.py,sha256=
|
|
301
|
+
tests/cli/test_collection.py,sha256=-CrcAiZVtsY7mXUNVlRjhFWEgmPL5k1dH9PjNhKzYdU,3028
|
|
302
|
+
tests/cli/test_run.py,sha256=flwZZ1PyMnrxy5f36mdUeGSO_ANpr2588dw1zHVQYJY,12735
|
|
293
303
|
tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
294
304
|
tests/perf/test_perf.py,sha256=iB8Mg565SfwPsObdAByHYfZNqN71kUtPW7ucmyiOWo8,3025
|
|
295
305
|
tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -302,9 +312,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
|
|
|
302
312
|
tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
|
|
303
313
|
tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
304
314
|
tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
|
|
305
|
-
evalscope-0.12.
|
|
306
|
-
evalscope-0.12.
|
|
307
|
-
evalscope-0.12.
|
|
308
|
-
evalscope-0.12.
|
|
309
|
-
evalscope-0.12.
|
|
310
|
-
evalscope-0.12.
|
|
315
|
+
evalscope-0.12.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
|
|
316
|
+
evalscope-0.12.1.dist-info/METADATA,sha256=jdU1I5E3YNc8PLfY0NYYDTKiXzTE4HYtX5J6OUPkQ_s,31337
|
|
317
|
+
evalscope-0.12.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
318
|
+
evalscope-0.12.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
|
|
319
|
+
evalscope-0.12.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
|
|
320
|
+
evalscope-0.12.1.dist-info/RECORD,,
|
tests/cli/test_collection.py
CHANGED
|
@@ -44,7 +44,7 @@ class TestCollection(unittest.TestCase):
|
|
|
44
44
|
from evalscope import TaskConfig, run_task
|
|
45
45
|
|
|
46
46
|
task_cfg = TaskConfig(
|
|
47
|
-
model='Qwen2.5-
|
|
47
|
+
model='Qwen2.5-0.5B-Instruct',
|
|
48
48
|
api_url='http://127.0.0.1:8801/v1/chat/completions',
|
|
49
49
|
api_key='EMPTY',
|
|
50
50
|
eval_type=EvalType.SERVICE,
|
tests/cli/test_run.py
CHANGED
|
@@ -4,7 +4,8 @@ import subprocess
|
|
|
4
4
|
import torch
|
|
5
5
|
import unittest
|
|
6
6
|
|
|
7
|
-
from evalscope.
|
|
7
|
+
from evalscope.config import TaskConfig
|
|
8
|
+
from evalscope.constants import EvalType, OutputType
|
|
8
9
|
from evalscope.run import run_task
|
|
9
10
|
from evalscope.utils import is_module_installed, test_level_list
|
|
10
11
|
from evalscope.utils.logger import get_logger
|
|
@@ -71,21 +72,104 @@ class TestRun(unittest.TestCase):
|
|
|
71
72
|
|
|
72
73
|
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
73
74
|
def test_run_task(self):
|
|
74
|
-
task_cfg =
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
75
|
+
task_cfg = TaskConfig(
|
|
76
|
+
model='qwen/Qwen2.5-0.5B-Instruct',
|
|
77
|
+
datasets=[
|
|
78
|
+
'iquiz',
|
|
79
|
+
# 'ifeval',
|
|
80
|
+
# 'mmlu',
|
|
81
|
+
# 'mmlu_pro',
|
|
82
|
+
# 'musr',
|
|
83
|
+
# 'process_bench',
|
|
84
|
+
# 'race',
|
|
85
|
+
# 'trivia_qa',
|
|
86
|
+
# 'cmmlu',
|
|
87
|
+
# 'humaneval',
|
|
88
|
+
# 'super_gpqa',
|
|
89
|
+
# 'gsm8k',
|
|
90
|
+
# 'bbh',
|
|
91
|
+
# 'competition_math',
|
|
92
|
+
# 'math_500',
|
|
93
|
+
'aime24',
|
|
94
|
+
'gpqa',
|
|
95
|
+
# 'arc',
|
|
96
|
+
# 'ceval',
|
|
97
|
+
# 'hellaswag',
|
|
98
|
+
# 'general_mcq',
|
|
99
|
+
# 'general_qa'
|
|
100
|
+
],
|
|
101
|
+
dataset_args={
|
|
102
|
+
'mmlu': {
|
|
103
|
+
'subset_list': ['elementary_mathematics'],
|
|
104
|
+
'few_shot_num': 0
|
|
105
|
+
},
|
|
106
|
+
'mmlu_pro': {
|
|
107
|
+
'subset_list': ['math', 'health'],
|
|
108
|
+
'few_shot_num': 4
|
|
109
|
+
},
|
|
110
|
+
'ceval': {
|
|
111
|
+
'subset_list': [
|
|
112
|
+
'computer_network', 'operating_system', 'computer_architecture'
|
|
113
|
+
],
|
|
114
|
+
'few_shot_num': 0
|
|
115
|
+
},
|
|
116
|
+
'cmmlu': {
|
|
117
|
+
'subset_list': ['elementary_chinese'],
|
|
118
|
+
'few_shot_num': 0
|
|
119
|
+
},
|
|
120
|
+
'bbh': {
|
|
121
|
+
'subset_list': ['word_sorting', 'movie_recommendation'],
|
|
122
|
+
},
|
|
123
|
+
'gpqa': {
|
|
124
|
+
'subset_list': ['gpqa_diamond'],
|
|
125
|
+
'few_shot_num': 0
|
|
126
|
+
},
|
|
127
|
+
'humaneval': {
|
|
128
|
+
'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
|
|
129
|
+
},
|
|
130
|
+
'competition_math': {
|
|
131
|
+
'subset_list': ['Level 1']
|
|
132
|
+
},
|
|
133
|
+
'process_bench': {
|
|
134
|
+
'subset_list': ['gsm8k'],
|
|
135
|
+
},
|
|
136
|
+
'musr': {
|
|
137
|
+
'subset_list': ['murder_mysteries']
|
|
138
|
+
},
|
|
139
|
+
'general_mcq': {
|
|
140
|
+
'local_path': 'custom_eval/text/mcq', # 自定义数据集路径
|
|
141
|
+
'subset_list': [
|
|
142
|
+
'example' # 评测数据集名称,上述 *_dev.csv 中的 *
|
|
143
|
+
],
|
|
144
|
+
'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}' # 问题模板
|
|
145
|
+
},
|
|
146
|
+
'general_qa': {
|
|
147
|
+
'local_path': 'custom_eval/text/qa', # 自定义数据集路径
|
|
148
|
+
'subset_list': [
|
|
149
|
+
'example', # 评测数据集名称,上述 *_dev.csv 中的 *
|
|
150
|
+
# 'test'
|
|
151
|
+
],
|
|
152
|
+
'metric_list': ['AverageBLEU']
|
|
153
|
+
},
|
|
154
|
+
'super_gpqa': {
|
|
155
|
+
'subset_list': ['Philosophy', 'Education'],
|
|
156
|
+
'few_shot_num': 0
|
|
157
|
+
},
|
|
158
|
+
'ifeval': {
|
|
159
|
+
'filters': {
|
|
160
|
+
'remove_until': '</think>'
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
},
|
|
164
|
+
limit=2,
|
|
165
|
+
eval_batch_size=2,
|
|
166
|
+
generation_config={
|
|
167
|
+
'max_new_tokens': 2048,
|
|
168
|
+
'temperature': 0.7,
|
|
169
|
+
'num_return_sequences': 1,
|
|
170
|
+
},
|
|
171
|
+
# debug=True
|
|
172
|
+
)
|
|
89
173
|
run_task(task_cfg=task_cfg)
|
|
90
174
|
|
|
91
175
|
|
|
@@ -146,7 +230,7 @@ class TestRun(unittest.TestCase):
|
|
|
146
230
|
api_key='EMPTY',
|
|
147
231
|
eval_type=EvalType.SERVICE,
|
|
148
232
|
datasets=[
|
|
149
|
-
'iquiz',
|
|
233
|
+
# 'iquiz',
|
|
150
234
|
# 'ifeval',
|
|
151
235
|
# 'mmlu',
|
|
152
236
|
# 'mmlu_pro',
|
|
@@ -161,10 +245,13 @@ class TestRun(unittest.TestCase):
|
|
|
161
245
|
# 'competition_math',
|
|
162
246
|
# 'math_500',
|
|
163
247
|
# 'aime24',
|
|
164
|
-
|
|
248
|
+
'gpqa',
|
|
165
249
|
# 'arc',
|
|
166
|
-
|
|
250
|
+
'ceval',
|
|
167
251
|
# 'hellaswag',
|
|
252
|
+
# 'general_mcq',
|
|
253
|
+
# 'general_qa'
|
|
254
|
+
# 'super_gpqa',
|
|
168
255
|
],
|
|
169
256
|
dataset_args={
|
|
170
257
|
'mmlu': {
|
|
@@ -189,8 +276,9 @@ class TestRun(unittest.TestCase):
|
|
|
189
276
|
'subset_list': ['word_sorting', 'movie_recommendation'],
|
|
190
277
|
},
|
|
191
278
|
'gpqa': {
|
|
192
|
-
'subset_list': ['gpqa_diamond'],
|
|
193
|
-
'few_shot_num': 0
|
|
279
|
+
# 'subset_list': ['gpqa_diamond'],
|
|
280
|
+
'few_shot_num': 0,
|
|
281
|
+
'local_path': './data/data/gpqa',
|
|
194
282
|
},
|
|
195
283
|
'humaneval': {
|
|
196
284
|
'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
|
|
@@ -204,17 +292,36 @@ class TestRun(unittest.TestCase):
|
|
|
204
292
|
'musr': {
|
|
205
293
|
'subset_list': ['murder_mysteries']
|
|
206
294
|
},
|
|
295
|
+
'general_mcq': {
|
|
296
|
+
'local_path': 'custom_eval/text/mcq', # 自定义数据集路径
|
|
297
|
+
'subset_list': [
|
|
298
|
+
'example' # 评测数据集名称,上述 *_dev.csv 中的 *
|
|
299
|
+
],
|
|
300
|
+
'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}' # 问题模板
|
|
301
|
+
},
|
|
302
|
+
'general_qa': {
|
|
303
|
+
'local_path': 'custom_eval/text/qa', # 自定义数据集路径
|
|
304
|
+
'subset_list': [
|
|
305
|
+
'example', # 评测数据集名称,上述 *_dev.csv 中的 *
|
|
306
|
+
# 'test'
|
|
307
|
+
],
|
|
308
|
+
'metric_list': ['AverageBLEU']
|
|
309
|
+
},
|
|
310
|
+
'super_gpqa': {
|
|
311
|
+
# 'subset_list': ['Philosophy', 'Education'],
|
|
312
|
+
'few_shot_num': 0
|
|
313
|
+
}
|
|
207
314
|
},
|
|
208
|
-
eval_batch_size=
|
|
209
|
-
limit=
|
|
210
|
-
debug=True,
|
|
211
|
-
stream=
|
|
315
|
+
eval_batch_size=32,
|
|
316
|
+
limit=10,
|
|
317
|
+
# debug=True,
|
|
318
|
+
stream=False,
|
|
212
319
|
generation_config={
|
|
213
|
-
'temperature': 0
|
|
320
|
+
'temperature': 0,
|
|
214
321
|
'n': 1,
|
|
215
|
-
'max_tokens':
|
|
322
|
+
'max_tokens': 4096,
|
|
216
323
|
},
|
|
217
|
-
# use_cache='
|
|
324
|
+
# use_cache='./outputs/20250212_150525',
|
|
218
325
|
)
|
|
219
326
|
|
|
220
327
|
run_task(task_cfg=task_cfg)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|