evalscope 0.12.0__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/arguments.py +6 -1
- evalscope/benchmarks/aime/aime24_adapter.py +3 -3
- evalscope/benchmarks/aime/aime25_adapter.py +3 -3
- evalscope/benchmarks/arc/arc_adapter.py +15 -18
- evalscope/benchmarks/bbh/bbh_adapter.py +6 -6
- evalscope/benchmarks/benchmark.py +12 -11
- evalscope/benchmarks/ceval/ceval_adapter.py +12 -16
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +168 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +13 -17
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -3
- evalscope/benchmarks/data_adapter.py +59 -21
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +9 -12
- evalscope/benchmarks/general_qa/general_qa_adapter.py +30 -15
- evalscope/benchmarks/gpqa/gpqa_adapter.py +12 -7
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -3
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -31
- evalscope/benchmarks/humaneval/humaneval_adapter.py +10 -7
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -3
- evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +193 -0
- evalscope/benchmarks/live_code_bench/execute_utils.py +267 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +90 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +71 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +721 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -6
- evalscope/benchmarks/mmlu/mmlu_adapter.py +13 -17
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +9 -5
- evalscope/benchmarks/musr/musr_adapter.py +8 -5
- evalscope/benchmarks/process_bench/process_bench_adapter.py +8 -5
- evalscope/benchmarks/race/race_adapter.py +12 -16
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +167 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
- evalscope/benchmarks/super_gpqa/utils.py +85 -0
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +6 -13
- evalscope/benchmarks/utils.py +43 -0
- evalscope/collections/evaluator.py +14 -5
- evalscope/config.py +15 -2
- evalscope/constants.py +14 -0
- evalscope/evaluator/evaluator.py +51 -13
- evalscope/metrics/llm_judge.py +104 -0
- evalscope/metrics/named_metrics.py +1 -0
- evalscope/models/__init__.py +2 -1
- evalscope/models/base_adapter.py +25 -5
- evalscope/models/chat_adapter.py +3 -0
- evalscope/models/choice_adapter.py +4 -0
- evalscope/models/custom_adapter.py +2 -0
- evalscope/models/register.py +28 -0
- evalscope/models/server_adapter.py +35 -8
- evalscope/perf/arguments.py +13 -7
- evalscope/perf/benchmark.py +5 -0
- evalscope/perf/http_client.py +15 -5
- evalscope/perf/main.py +1 -0
- evalscope/perf/utils/analysis_result.py +1 -1
- evalscope/report/app.py +3 -0
- evalscope/report/combinator.py +2 -2
- evalscope/run.py +6 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/eval.py +220 -55
- evalscope/third_party/thinkbench/infer.py +37 -7
- evalscope/third_party/thinkbench/tools/llm.py +1 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
- evalscope/utils/chat_service.py +1 -0
- evalscope/utils/filters.py +59 -0
- evalscope/utils/logger.py +3 -3
- evalscope/version.py +2 -2
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/METADATA +31 -12
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/RECORD +85 -62
- tests/cli/test_all.py +144 -0
- tests/cli/test_collection.py +28 -2
- tests/cli/test_run.py +201 -32
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/LICENSE +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/WHEEL +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/top_level.txt +0 -0
evalscope/utils/logger.py
CHANGED
|
@@ -12,12 +12,12 @@ detailed_formatter = logging.Formatter(detailed_format)
|
|
|
12
12
|
simple_formatter = logging.Formatter(simple_format)
|
|
13
13
|
DEFAULT_LEVEL = logging.DEBUG if os.getenv('LOG_LEVEL', 'INFO') == 'DEBUG' else logging.INFO
|
|
14
14
|
|
|
15
|
-
logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL)
|
|
15
|
+
logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL, force=True)
|
|
16
16
|
|
|
17
|
-
#
|
|
17
|
+
# set logging level
|
|
18
18
|
logging.getLogger('datasets').setLevel(logging.WARNING)
|
|
19
|
-
logging.getLogger('modelscope').setLevel(logging.WARNING)
|
|
20
19
|
logging.getLogger('httpx').setLevel(logging.WARNING)
|
|
20
|
+
logging.getLogger('modelscope').setLevel(logging.ERROR)
|
|
21
21
|
|
|
22
22
|
|
|
23
23
|
def get_logger(log_file: Optional[str] = None, log_level: int = DEFAULT_LEVEL, file_mode: str = 'w', force=False):
|
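evalscope/version.py
CHANGED
(diff body for this file is missing from the extract)
{evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/METADATA
CHANGED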
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: evalscope
|
|
3
|
-
Version: 0.12.0
|
|
3
|
+
Version: 0.13.0
|
|
4
4
|
Summary: EvalScope: Lightweight LLMs Evaluation Framework
|
|
5
5
|
Home-page: https://github.com/modelscope/evalscope
|
|
6
6
|
Author: ModelScope team
|
|
@@ -175,16 +175,29 @@ Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
|
|
|
175
175
|
> ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!
|
|
176
176
|
|
|
177
177
|
## 📋 Contents
|
|
178
|
-
- [
|
|
179
|
-
- [
|
|
180
|
-
- [
|
|
181
|
-
- [
|
|
178
|
+
- [📋 Contents](#-contents)
|
|
179
|
+
- [📝 Introduction](#-introduction)
|
|
180
|
+
- [☎ User Groups](#-user-groups)
|
|
181
|
+
- [🎉 News](#-news)
|
|
182
|
+
- [🛠️ Installation](#️-installation)
|
|
183
|
+
- [Method 1: Install Using pip](#method-1-install-using-pip)
|
|
184
|
+
- [Method 2: Install from Source](#method-2-install-from-source)
|
|
185
|
+
- [🚀 Quick Start](#-quick-start)
|
|
186
|
+
- [Method 1. Using Command Line](#method-1-using-command-line)
|
|
187
|
+
- [Method 2. Using Python Code](#method-2-using-python-code)
|
|
188
|
+
- [Basic Parameter](#basic-parameter)
|
|
189
|
+
- [Output Results](#output-results)
|
|
190
|
+
- [📈 Visualization of Evaluation Results](#-visualization-of-evaluation-results)
|
|
191
|
+
- [🌐 Evaluation of Specified Model API](#-evaluation-of-specified-model-api)
|
|
192
|
+
- [⚙️ Custom Parameter Evaluation](#️-custom-parameter-evaluation)
|
|
193
|
+
- [Parameter](#parameter)
|
|
182
194
|
- [Evaluation Backend](#evaluation-backend)
|
|
183
|
-
- [
|
|
184
|
-
- [
|
|
185
|
-
- [Arena Mode](
|
|
186
|
-
- [Contribution](#️-contribution)
|
|
187
|
-
- [Roadmap](#-roadmap)
|
|
195
|
+
- [📈 Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
|
|
196
|
+
- [🖊️ Custom Dataset Evaluation](#️-custom-dataset-evaluation)
|
|
197
|
+
- [🏟️ Arena Mode](#️-arena-mode)
|
|
198
|
+
- [👷♂️ Contribution](#️-contribution)
|
|
199
|
+
- [🔜 Roadmap](#-roadmap)
|
|
200
|
+
- [Star History](#star-history)
|
|
188
201
|
|
|
189
202
|
|
|
190
203
|
## 📝 Introduction
|
|
@@ -225,10 +238,16 @@ Please scan the QR code below to join our community groups:
|
|
|
225
238
|
|
|
226
239
|
|
|
227
240
|
## 🎉 News
|
|
228
|
-
|
|
241
|
+
|
|
242
|
+
- 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark. You can use it by specifying `live_code_bench`.
|
|
243
|
+
- 🔥 **[2025.03.11]** Added support for the [SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/SimpleQA/summary) and [Chinese SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/Chinese-SimpleQA/summary) evaluation benchmarks. These are used to assess the factual accuracy of models, and you can specify `simple_qa` and `chinese_simpleqa` for use. Support for specifying a judge model is also available. For more details, refer to the [relevant parameter documentation](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).
|
|
244
|
+
- 🔥 **[2025.03.07]** Added support for the [QwQ-32B](https://modelscope.cn/models/Qwen/QwQ-32B/summary) model, evaluate the model's reasoning ability and reasoning efficiency, refer to [📖 Best Practices for QwQ-32B Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html) for more details.
|
|
245
|
+
- 🔥 **[2025.03.04]** Added support for the [SuperGPQA](https://modelscope.cn/datasets/m-a-p/SuperGPQA/summary) dataset, which covers 13 categories, 72 first-level disciplines, and 285 second-level disciplines, totaling 26,529 questions. You can use it by specifying `super_gpqa`.
|
|
246
|
+
- 🔥 **[2025.03.03]** Added support for evaluating the IQ and EQ of models. Refer to [📖 Best Practices for IQ and EQ Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/iquiz.html) to find out how smart your AI is!
|
|
247
|
+
- 🔥 **[2025.02.27]** Added support for evaluating the reasoning efficiency of models. Refer to [📖 Best Practices for Evaluating Thinking Efficiency](https://evalscope.readthedocs.io/en/latest/best_practice/think_eval.html). This implementation is inspired by the works [Overthinking](https://doi.org/10.48550/arXiv.2412.21187) and [Underthinking](https://doi.org/10.48550/arXiv.2501.18585).
|
|
229
248
|
- 🔥 **[2025.02.25]** Added support for two model inference-related evaluation benchmarks: [MuSR](https://modelscope.cn/datasets/AI-ModelScope/MuSR) and [ProcessBench](https://www.modelscope.cn/datasets/Qwen/ProcessBench/summary). To use them, simply specify `musr` and `process_bench` respectively in the datasets parameter.
|
|
230
249
|
- 🔥 **[2025.02.18]** Supports the AIME25 dataset, which contains 15 questions (Grok3 scored 93 on this dataset).
|
|
231
|
-
- 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/
|
|
250
|
+
- 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
|
|
232
251
|
- 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
|
|
233
252
|
- 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
|
|
234
253
|
- 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
|
|
{evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/RECORD
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
|
|
2
|
-
evalscope/arguments.py,sha256=
|
|
3
|
-
evalscope/config.py,sha256=
|
|
4
|
-
evalscope/constants.py,sha256=
|
|
5
|
-
evalscope/run.py,sha256=
|
|
2
|
+
evalscope/arguments.py,sha256=VhZd7a8PoZK01qFCMEADLINqLYi6njRqRb50iR1l1lo,5241
|
|
3
|
+
evalscope/config.py,sha256=9bMV7wf8pM7N5dEj_kJsCq6oM8xobzQDYh0NF8h-j1I,9313
|
|
4
|
+
evalscope/constants.py,sha256=ydS8oihksGnvvzvJZw7HGhEeeccHNpJxspB81gAv29Y,3720
|
|
5
|
+
evalscope/run.py,sha256=Udz-H503UaMYos0ic3A_npXIbnd4eJLx26q5UEahF-U,5797
|
|
6
6
|
evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
|
|
7
7
|
evalscope/summarizer.py,sha256=Wnt8Y61PasOkigo8zMpi1qdFzfETjfnDtCDDQ6VwgDw,5867
|
|
8
|
-
evalscope/version.py,sha256=
|
|
8
|
+
evalscope/version.py,sha256=a1r1BkZoSpoA_eGXZoXm6WaLayRHhF__TgvE9xG-Whs,119
|
|
9
9
|
evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
|
|
11
11
|
evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
|
|
@@ -56,16 +56,17 @@ evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4ty
|
|
|
56
56
|
evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
|
|
57
57
|
evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
|
|
58
58
|
evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
|
|
59
|
-
evalscope/benchmarks/benchmark.py,sha256=
|
|
60
|
-
evalscope/benchmarks/data_adapter.py,sha256=
|
|
59
|
+
evalscope/benchmarks/benchmark.py,sha256=a_7Ctz36McuTyBSTYi56jis9pvOdWhg7JVSPFrbxqR4,2535
|
|
60
|
+
evalscope/benchmarks/data_adapter.py,sha256=2u9oC4RBHVfEMHKPRu87xM4XOw_RS2Z2fvagNsciEo4,16791
|
|
61
|
+
evalscope/benchmarks/utils.py,sha256=6kxeBz4w8Fw68AYH05a4ncjgkaUV4bU3eaFVLqOdkMI,1321
|
|
61
62
|
evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
62
|
-
evalscope/benchmarks/aime/aime24_adapter.py,sha256=
|
|
63
|
-
evalscope/benchmarks/aime/aime25_adapter.py,sha256=
|
|
63
|
+
evalscope/benchmarks/aime/aime24_adapter.py,sha256=dBm9yukt4-CByEPUlAPAIN6mL3VkZcI-dw2kz4oQBMo,1715
|
|
64
|
+
evalscope/benchmarks/aime/aime25_adapter.py,sha256=FB_NufY2V7uYdxVnrY_4y81gyyfYDnvedz1_zHdDWt4,1709
|
|
64
65
|
evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
65
66
|
evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
|
|
66
|
-
evalscope/benchmarks/arc/arc_adapter.py,sha256=
|
|
67
|
+
evalscope/benchmarks/arc/arc_adapter.py,sha256=lkhDz-DYjPQ1vHzo8X4j-0Lq_rBxAnws35_R00pIbNI,6347
|
|
67
68
|
evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
68
|
-
evalscope/benchmarks/bbh/bbh_adapter.py,sha256=
|
|
69
|
+
evalscope/benchmarks/bbh/bbh_adapter.py,sha256=fROpzenrjpEBWtnvM_RL_m0uXPOhXTtYAglJEZbzUdY,8330
|
|
69
70
|
evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
|
|
70
71
|
evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt,sha256=sfo-2iOeVzB0OGgd7NSQFELTGDTsr2DQ3u-g0ivI-sM,3653
|
|
71
72
|
evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt,sha256=UJBsc3Mwz8TZngdWH_NFlhhNbLhNHK6FvW9FHcS8H5g,1167
|
|
@@ -94,65 +95,83 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt
|
|
|
94
95
|
evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
|
|
95
96
|
evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
|
|
96
97
|
evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
97
|
-
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=
|
|
98
|
+
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=E4QobCjSSkMZtPJyaT_XBVxiqEqa1bta1I9aFnaHOqs,11308
|
|
98
99
|
evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
|
|
100
|
+
evalscope/benchmarks/chinese_simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
101
|
+
evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=nKF_a0yc_PbZYjYA_-gJh3ePZIEz5txrhDV4IsTqD4Q,8196
|
|
99
102
|
evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
100
103
|
evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
|
|
101
|
-
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=
|
|
104
|
+
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=TTq2jRz46Hqc_D_ZBaiw_OwKub1FZX6w8C7g7COIdGs,10372
|
|
102
105
|
evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
|
|
103
106
|
evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
104
107
|
evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
|
|
105
|
-
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=
|
|
108
|
+
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=F2YCaNDn49X82l06WlLFp2OPFB7nv0ecW40099I9iSE,6871
|
|
106
109
|
evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
107
|
-
evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=
|
|
110
|
+
evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=U4M-0MVJS3-z03YW8nafooFJ7x60e5uEpBO5z_c7zk8,2450
|
|
108
111
|
evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
109
|
-
evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=
|
|
112
|
+
evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=ZVGzUuuQ0UTOqQtXE40ZyBeMOSl8saSiFEQ5_siJ-c8,5052
|
|
110
113
|
evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
111
|
-
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=
|
|
114
|
+
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=ELDdS5T3JZeSWVv1ldawcHzLwAljEWKqakbRMVcBvgw,4741
|
|
112
115
|
evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
113
116
|
evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
|
|
114
|
-
evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=
|
|
117
|
+
evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=UB287DtnbkSQXZsbReFJqmQRwbo672DTCeXXilR_-Vc,4790
|
|
115
118
|
evalscope/benchmarks/gsm8k/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
116
119
|
evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTvt5hpXg,4233
|
|
117
|
-
evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=
|
|
120
|
+
evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=ZZZ-9oja53IwiU33Kjm7NTk4MbFGWyvonhnHrn_3Na8,10557
|
|
118
121
|
evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
119
122
|
evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
|
|
120
|
-
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=
|
|
123
|
+
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=QYZZuxbjkKxAjxuoWn0M5WgusO55vzeAcyKnWUMow3M,5871
|
|
121
124
|
evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
122
125
|
evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
|
|
123
|
-
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=
|
|
126
|
+
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=UOjakV31J0g7TYbrRls0ItcopWOJu54ucPfaqSJB7Os,5250
|
|
124
127
|
evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
125
|
-
evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=
|
|
128
|
+
evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=R7MILWuMglvXr7yWioBxyJ2T4EdEkwRZ1lnvWqZqG28,1922
|
|
126
129
|
evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
|
|
127
130
|
evalscope/benchmarks/ifeval/instructions_registry.py,sha256=tVUmhuSwnOidLtI8onOAw_gpJ6bi8FL07GiX19hSuo8,7288
|
|
128
131
|
evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2OF-oH_NcSZtzyZDURW4,25780
|
|
129
132
|
evalscope/benchmarks/ifeval/utils.py,sha256=TKrM1m2qDCUauahogItDdICf4mDk0OjasSxgnxjt2KY,4517
|
|
130
133
|
evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
131
|
-
evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=
|
|
134
|
+
evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=16whmFkJt9fLbei9d-kmjnWB_5y5vsiX9tK5kSuxDw8,2449
|
|
135
|
+
evalscope/benchmarks/live_code_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
136
|
+
evalscope/benchmarks/live_code_bench/evaluate_utils.py,sha256=rOWaG8PV4AGIRhS_gqwxEhphEVe1Cqg57Eudwm5HTjI,6820
|
|
137
|
+
evalscope/benchmarks/live_code_bench/execute_utils.py,sha256=MreaMLI0IicNZawpfqcyoRLt67EZ3CJvmxxRTYwhAbU,7397
|
|
138
|
+
evalscope/benchmarks/live_code_bench/extract_utils.py,sha256=ZcQ8y741uawPo6I_1_XglR3eqJFDNrqc8fILKZupVRs,2375
|
|
139
|
+
evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=8MOECcweLG465JFgUzP20QlKyBAO90oFHhH7Z77FuUY,3521
|
|
140
|
+
evalscope/benchmarks/live_code_bench/load_utils.py,sha256=5i9wtdPLYR8ckjx5MaYQVC2LFYvjKzR6Fa6UZmeOTRc,2445
|
|
141
|
+
evalscope/benchmarks/live_code_bench/pass_k_utils.py,sha256=Ktrp_lXdfFzoHtQNQNdGfIl26ySjaPCHm4Zv-dFvRqM,2024
|
|
142
|
+
evalscope/benchmarks/live_code_bench/prompts.py,sha256=P4KILIAIDT1MKDck0xHYV_6v9820wDZRhxVMazmlL-g,12600
|
|
143
|
+
evalscope/benchmarks/live_code_bench/testing_util.py,sha256=EBe0XzY3B4cW5dCjwLksW7o4R1chZwsuFjxkfqVPFI4,28238
|
|
132
144
|
evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
133
|
-
evalscope/benchmarks/math_500/math_500_adapter.py,sha256=
|
|
145
|
+
evalscope/benchmarks/math_500/math_500_adapter.py,sha256=SB2eb4Z7DTXdptqirEoctqTdDLEu28s7bLeCAMBmAFo,1923
|
|
134
146
|
evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
135
147
|
evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
|
|
136
|
-
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=
|
|
148
|
+
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=e__Evar99V9l65FlzT6T594CN4iMgmuVhjujQAm4po4,11662
|
|
137
149
|
evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
|
|
138
150
|
evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
139
|
-
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=
|
|
151
|
+
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=OANfue-fK543drJrDj6V_zDMtySrQEBHPgTsejr-e7U,4226
|
|
140
152
|
evalscope/benchmarks/musr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
141
|
-
evalscope/benchmarks/musr/musr_adapter.py,sha256=
|
|
153
|
+
evalscope/benchmarks/musr/musr_adapter.py,sha256=Po8hcIQiqlFo0AGjcNQe75cpsMNDcfiJaKgZsk33-DY,2442
|
|
142
154
|
evalscope/benchmarks/process_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
143
155
|
evalscope/benchmarks/process_bench/critique_template.txt,sha256=tycx8n42QEC0uGcwbIvHfZvfTnchlRxGz8Tp1R2_e_Y,489
|
|
144
|
-
evalscope/benchmarks/process_bench/process_bench_adapter.py,sha256=
|
|
156
|
+
evalscope/benchmarks/process_bench/process_bench_adapter.py,sha256=ydU-r1T0DaYhOxkhZgGL7PhDd4XoeqOBzVO9oiFPd8M,3422
|
|
145
157
|
evalscope/benchmarks/race/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
146
158
|
evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO61nU,3608
|
|
147
|
-
evalscope/benchmarks/race/race_adapter.py,sha256=
|
|
159
|
+
evalscope/benchmarks/race/race_adapter.py,sha256=RD0B-i5dzeNKuhqnWbremgf4tk9jmOO4_eLAiITB1F0,6381
|
|
148
160
|
evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
|
|
161
|
+
evalscope/benchmarks/simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
162
|
+
evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=SrK18xDe4HyUaIPRLVEDtoF4Nc_ms4aFxktEsj8MnnA,9071
|
|
163
|
+
evalscope/benchmarks/super_gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
164
|
+
evalscope/benchmarks/super_gpqa/five_shot_prompt.txt,sha256=vD3RMeQustxY_oWA8IobntjywT8ZUO7Jaub--rElDT4,4718
|
|
165
|
+
evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=BqNLL8BYnK6tRuIdV6ijL4Uym2SejH_h1BV06XNjSE4,9331
|
|
166
|
+
evalscope/benchmarks/super_gpqa/utils.py,sha256=ftYPP9ODvLBlQSd9ltACx9iRIvjB8u1bg4AtgcJ4JAI,3360
|
|
167
|
+
evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt,sha256=y7hR9SmoR_YqoEWtT8N9JpZOpeJIlg0cDGDgYw6R6hM,237
|
|
149
168
|
evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
150
169
|
evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
|
|
151
170
|
evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
|
|
152
|
-
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=
|
|
171
|
+
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=7tMc8vVZdBnks5jWrBSrb5BSyjO2eD4On6gX8xqlkV8,4961
|
|
153
172
|
evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
154
173
|
evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
|
|
155
|
-
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=
|
|
174
|
+
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=ueUU860kg5_xf_MtUCa6ck-fGHX3ttw8Xh3mWSJyOZA,12617
|
|
156
175
|
evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
157
176
|
evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
|
|
158
177
|
evalscope/cli/cli.py,sha256=w_dtXljur9s5lmTn6LbbFL_viTPQB1WAEzhYcId09Og,729
|
|
@@ -161,40 +180,42 @@ evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,7
|
|
|
161
180
|
evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
|
|
162
181
|
evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
|
|
163
182
|
evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
|
|
164
|
-
evalscope/collections/evaluator.py,sha256=
|
|
183
|
+
evalscope/collections/evaluator.py,sha256=okP4_a5vuM-Z0O_4ntauuyn2NeH228JUo_YrbrTqKPM,12741
|
|
165
184
|
evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
|
|
166
185
|
evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
|
|
167
186
|
evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
|
|
168
|
-
evalscope/evaluator/evaluator.py,sha256=
|
|
187
|
+
evalscope/evaluator/evaluator.py,sha256=yj7ds5WMYqQcRw3B3x11-cajl4DmWsLM_3kO1n2k7OE,19734
|
|
169
188
|
evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
|
|
170
189
|
evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
171
190
|
evalscope/evaluator/reviewer/auto_reviewer.py,sha256=PDz1Nt_qq0oGZBBwek2-M8VBUNLkOkmYObzR8gO7nEc,16624
|
|
172
191
|
evalscope/metrics/__init__.py,sha256=SWvqzUzdryW5URz6u4fPkP9XSyA09nQ8zBeE8BbchSg,349
|
|
173
192
|
evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
|
|
193
|
+
evalscope/metrics/llm_judge.py,sha256=g9pLMJPNTUyw0sGteblws1_e_KzbRqcbqKcaIzfE_DE,4031
|
|
174
194
|
evalscope/metrics/math_parser.py,sha256=uTDudn305G3b8-GboWTrDE6OfrEwAW-areHnoGXZ6Is,17302
|
|
175
195
|
evalscope/metrics/metrics.py,sha256=_YI7RhxlFu_JOgeE3LF9UKu6mJruvyu4FgqVf78Bjb8,13813
|
|
176
|
-
evalscope/metrics/named_metrics.py,sha256=
|
|
196
|
+
evalscope/metrics/named_metrics.py,sha256=pSHA2_qdi9B5bDHIh08GYhx63odilSwA_T-95K1Usl0,1380
|
|
177
197
|
evalscope/metrics/rouge_metric.py,sha256=zhIUqenSuxnORR9tamLQBGjFwP91Zei2UiLtcOyseVM,4639
|
|
178
198
|
evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
|
|
179
199
|
evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=Kq6AObenmLVQ5tN3NgN042a6mgRFQmRO21-ohd9mSa8,11972
|
|
180
200
|
evalscope/metrics/resources/gpt2-zhcn3-v4.bpe,sha256=J_K-oSt9usFEw87Av6j7ETn3J48EsmFuY5_iVvY6xjc,524464
|
|
181
201
|
evalscope/metrics/resources/gpt2-zhcn3-v4.json,sha256=WkM4J_FDPPNQwYi0kj5sM5SVjk2_6bci7tqf8dV9p_U,1289341
|
|
182
|
-
evalscope/models/__init__.py,sha256=
|
|
183
|
-
evalscope/models/base_adapter.py,sha256=
|
|
184
|
-
evalscope/models/chat_adapter.py,sha256=
|
|
185
|
-
evalscope/models/choice_adapter.py,sha256=
|
|
186
|
-
evalscope/models/custom_adapter.py,sha256=
|
|
202
|
+
evalscope/models/__init__.py,sha256=i9vcOBMEF_UM7C2gpmh2GsQk3njwqevoQ6A4CnP1fHs,1000
|
|
203
|
+
evalscope/models/base_adapter.py,sha256=7PbRwfD5PIZCBYVds6ZHI8TBY9C5i2LdPOTu88FJWlY,3414
|
|
204
|
+
evalscope/models/chat_adapter.py,sha256=5-yz7L41OdeBO9J_qRkEZcduATrYIMe__UFfh7BzjIc,6277
|
|
205
|
+
evalscope/models/choice_adapter.py,sha256=fnJdo-FMJ-zvNLbEJGc73odgWXIxtVudL00JIf2vzsA,8239
|
|
206
|
+
evalscope/models/custom_adapter.py,sha256=Za52WF1I_YcJkGomJ6s9sP2Fs8DoJ4HHBYBi3iC3WNI,2379
|
|
187
207
|
evalscope/models/local_model.py,sha256=yydggBCLcBAmUWbBhv7o2CA3RbG0DwDZharPdrkbNcg,2628
|
|
188
208
|
evalscope/models/model.py,sha256=diu4TE1ZFWdynTxsl4DejTNsLdwjxoyj2nsKR-Y8EZE,7343
|
|
189
|
-
evalscope/models/
|
|
209
|
+
evalscope/models/register.py,sha256=4vX6AfScAzwD7UkncbuejfAiQHznQkK5hvtG6jEUbWo,809
|
|
210
|
+
evalscope/models/server_adapter.py,sha256=dS_o9_iC8QY73AehIekYwBQieFECZ97JRfbfleJ-Dtk,6845
|
|
190
211
|
evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk77ksNQ,102
|
|
191
212
|
evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
|
|
192
213
|
evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
|
|
193
214
|
evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
194
|
-
evalscope/perf/arguments.py,sha256=
|
|
195
|
-
evalscope/perf/benchmark.py,sha256=
|
|
196
|
-
evalscope/perf/http_client.py,sha256=
|
|
197
|
-
evalscope/perf/main.py,sha256=
|
|
215
|
+
evalscope/perf/arguments.py,sha256=u3GNdnOBmiEirtgJLspsLO7qBwHeWLoXd4vlt69jJ-g,9717
|
|
216
|
+
evalscope/perf/benchmark.py,sha256=hKN-Nu-x-VTswHP0M6PT3jvduWxN7AJpz34DBrUcafQ,9734
|
|
217
|
+
evalscope/perf/http_client.py,sha256=xMakdQkJ2cgIOd-yOmHEW0vbGKTJ0JWhLFt9IFtUP8Q,7473
|
|
218
|
+
evalscope/perf/main.py,sha256=aZUrfbz-Pl2xe8AgUL_6rW6n8dX4YAToDw5xPpLtbI4,1278
|
|
198
219
|
evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
|
|
199
220
|
evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
|
|
200
221
|
evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
|
|
@@ -211,7 +232,7 @@ evalscope/perf/plugin/datasets/longalpaca.py,sha256=2aENqCly_DX1dyNcurYsLFJIvXYF
|
|
|
211
232
|
evalscope/perf/plugin/datasets/openqa.py,sha256=2pv7yyPSFYTjPhvAGBsHl0eQO8gt7Wk1CaKcfTi3Tnc,1394
|
|
212
233
|
evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=ef8MXhT6756y6LsXSpYeWjmwswu2hRXe2BOVS2_OgVM,1968
|
|
213
234
|
evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
214
|
-
evalscope/perf/utils/analysis_result.py,sha256=
|
|
235
|
+
evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
|
|
215
236
|
evalscope/perf/utils/benchmark_util.py,sha256=4TyQ_tE5odcjKDFDueI3jrC0vld6QxmTreOd5_SP4vE,5802
|
|
216
237
|
evalscope/perf/utils/db_util.py,sha256=PSBq16uWyzXx0zyoEE4wazWKN19UAA8_GjobS7rTPso,9001
|
|
217
238
|
evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
|
|
@@ -238,8 +259,8 @@ evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHM
|
|
|
238
259
|
evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
|
|
239
260
|
evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
|
|
240
261
|
evalscope/report/__init__.py,sha256=0Wes3ot2hy9s-WwZaBztst8qkNrXkOF-Hwa1WW1e8lY,260
|
|
241
|
-
evalscope/report/app.py,sha256=
|
|
242
|
-
evalscope/report/combinator.py,sha256=
|
|
262
|
+
evalscope/report/app.py,sha256=cvof2Nm4ORxC4D3L22Kg3Ngu3kJwBZlfnFJkwDMCmSQ,26881
|
|
263
|
+
evalscope/report/combinator.py,sha256=O3QirwtYhDhdaWVT4STJMCGZMwoX8BTeJ3HtS9iwnWQ,2567
|
|
243
264
|
evalscope/report/generator.py,sha256=2DULY9W8QCUxdtyfNjo8XAP_YxI1LgR95jknK__kYPU,3600
|
|
244
265
|
evalscope/report/utils.py,sha256=DRlbjbqHEmM8rGlA4pwtlHFhOZtyUzcqiS-mejfIDkU,4584
|
|
245
266
|
evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
@@ -248,7 +269,7 @@ evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u
|
|
|
248
269
|
evalscope/third_party/longbench_write/default_task.json,sha256=d_NPShtW10Mc02U3pAuxX9hXd09tZw7QJAr1SvrECcM,694
|
|
249
270
|
evalscope/third_party/longbench_write/default_task.yaml,sha256=YjU8EeyH9UtM8e7_fhrwJNChQdszOAcrKmOi--Awvhk,578
|
|
250
271
|
evalscope/third_party/longbench_write/eval.py,sha256=39McZSDHL7bA5Dg-BSyZ4EiAF1nfTiYJAnx5FqbNYok,11265
|
|
251
|
-
evalscope/third_party/longbench_write/infer.py,sha256=
|
|
272
|
+
evalscope/third_party/longbench_write/infer.py,sha256=32t90zTll6SXH7Wx8QnRFMs6ZUwvpbgYNuawCByzwR0,4971
|
|
252
273
|
evalscope/third_party/longbench_write/longbench_write.py,sha256=nIR1toB1hvUXR7Lrs3xcY9wqaI-bjeADg_Oscf3HdaY,3991
|
|
253
274
|
evalscope/third_party/longbench_write/utils.py,sha256=nd-YslsOyNGAuyBfAWb2pnTMaGLMQ58lbnJJdrCndeI,815
|
|
254
275
|
evalscope/third_party/longbench_write/resources/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
@@ -260,12 +281,12 @@ evalscope/third_party/longbench_write/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc
|
|
|
260
281
|
evalscope/third_party/longbench_write/tools/data_etl.py,sha256=T7a-4PwZg5alZQh-oTi1zjMxjGmVVZYVwSR9-diZlF8,5971
|
|
261
282
|
evalscope/third_party/longbench_write/tools/openai_api.py,sha256=PiIvvDYJkn041SJkLoroXwl1B8TtwpB7licVfqNSeuQ,8168
|
|
262
283
|
evalscope/third_party/thinkbench/__init__.py,sha256=C0aSu71_dc1upUVkKmq2VgDd9plpRcYUdCE6BjUWJcA,110
|
|
263
|
-
evalscope/third_party/thinkbench/eval.py,sha256=
|
|
264
|
-
evalscope/third_party/thinkbench/infer.py,sha256
|
|
284
|
+
evalscope/third_party/thinkbench/eval.py,sha256=76G4LTkxqWCDCyj7Ahjj-qjO1gFem1uDzpRAC27ICl0,18896
|
|
285
|
+
evalscope/third_party/thinkbench/infer.py,sha256=2L4DAJKn3wAhNEKnKudQT60igGOJSKH80FR4nS7DHYk,3952
|
|
265
286
|
evalscope/third_party/thinkbench/resources/critique_template.txt,sha256=d4Egc-qH--4lG8X_EcmgymnuZgiCMbee1M5pt4HrRKA,535
|
|
266
287
|
evalscope/third_party/thinkbench/resources/reformat_template.txt,sha256=zTZyVAzmMBtAwI9lHly9EXsqX471OW-VTg538PDcB30,1775
|
|
267
288
|
evalscope/third_party/thinkbench/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
268
|
-
evalscope/third_party/thinkbench/tools/llm.py,sha256=
|
|
289
|
+
evalscope/third_party/thinkbench/tools/llm.py,sha256=HCFh58_THsVrFVzvGoThwWRu8EbPXD0DotLQEj5u4Tg,1353
|
|
269
290
|
evalscope/third_party/thinkbench/tools/utils.py,sha256=rDu2GVTK4ji9Yh9RLVksZqrfurQsSuN9GW3QCKJ60ng,401
|
|
270
291
|
evalscope/third_party/toolbench_static/README.md,sha256=Osdnt0_K-setbmYwDPCPRp2LXxamGp2mE8KsOByPPOY,3944
|
|
271
292
|
evalscope/third_party/toolbench_static/__init__.py,sha256=BO936RxwodHr4OEpV6W3S_keC91OfOd41_msIJ2d0fs,128
|
|
@@ -276,20 +297,22 @@ evalscope/third_party/toolbench_static/infer.py,sha256=rsADLhEd2IBcC6EI9aD7hSJmo
|
|
|
276
297
|
evalscope/third_party/toolbench_static/requirements.txt,sha256=OW91Z8hfzh7yQUYgP1Di_E6DgNgGoGP1UcvnqrdCR68,22
|
|
277
298
|
evalscope/third_party/toolbench_static/toolbench_static.py,sha256=ABb9Gy09zMt30tY50AZGxSZ46k3NVEsvuDj6xlLOjeA,1966
|
|
278
299
|
evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
279
|
-
evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=
|
|
300
|
+
evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=GITEbyiER10Zi-ZWpSqYCdAsiVtNeGK24hvR3kmYn2s,2689
|
|
280
301
|
evalscope/utils/__init__.py,sha256=jLVoGryuqUh4Km9QWWQBzpqkcVNRK0MbwNaSgckqdiU,139
|
|
281
302
|
evalscope/utils/arena_utils.py,sha256=Gf8VpH4C_oF2Abif_QeL0rAP6tvTzsc0gglpdNkUE48,7155
|
|
282
|
-
evalscope/utils/chat_service.py,sha256=
|
|
303
|
+
evalscope/utils/chat_service.py,sha256=9LNTT-8KsacOLqnQer8j57e224rwOMbU7txV6re-X-A,8720
|
|
283
304
|
evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
|
|
305
|
+
evalscope/utils/filters.py,sha256=x_NX40uWMmUsVrAGHCeeV2e63HZZFugWUgdUhk64ivM,1523
|
|
284
306
|
evalscope/utils/io_utils.py,sha256=Tjdgen1FsAA4ArqiUzu734L0Px5NuiS0GKRRiGIzxSA,4192
|
|
285
|
-
evalscope/utils/logger.py,sha256=
|
|
307
|
+
evalscope/utils/logger.py,sha256=barHSdtbEu21ynGQj_wS-rd7B02wPPR5AgaWCQzvG4w,3638
|
|
286
308
|
evalscope/utils/model_utils.py,sha256=hB9W334ecAb6553FhooT6_jM0g-tjj6AU48IV3K1CKw,1131
|
|
287
309
|
evalscope/utils/utils.py,sha256=lGvn94ryIzx-7WLNJeuyehNTmINt0jYIjrjW12woPCs,9730
|
|
288
310
|
tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
289
311
|
tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
|
|
290
312
|
tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
291
|
-
tests/cli/
|
|
292
|
-
tests/cli/
|
|
313
|
+
tests/cli/test_all.py,sha256=1wwXtdjBmWYLhs5TXOJhZBwPm2qd9FYFqQSemXWKNUs,3865
|
|
314
|
+
tests/cli/test_collection.py,sha256=V-_M7ngwekMGqPuI16jjJZyAK2XLE4Z6QTn-8B5ykgU,4071
|
|
315
|
+
tests/cli/test_run.py,sha256=LKWWxT0jaMLtcIl57vnXEFFlzbJpAplFqqwinvAHN8Y,15047
|
|
293
316
|
tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
294
317
|
tests/perf/test_perf.py,sha256=iB8Mg565SfwPsObdAByHYfZNqN71kUtPW7ucmyiOWo8,3025
|
|
295
318
|
tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -302,9 +325,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
|
|
|
302
325
|
tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
|
|
303
326
|
tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
304
327
|
tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
|
|
305
|
-
evalscope-0.
|
|
306
|
-
evalscope-0.
|
|
307
|
-
evalscope-0.
|
|
308
|
-
evalscope-0.
|
|
309
|
-
evalscope-0.
|
|
310
|
-
evalscope-0.
|
|
328
|
+
evalscope-0.13.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
|
|
329
|
+
evalscope-0.13.0.dist-info/METADATA,sha256=0i3SENci2ws_vqdewQAxVUqan-MV1LwJoLLcEZ8ML7w,32870
|
|
330
|
+
evalscope-0.13.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
331
|
+
evalscope-0.13.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
|
|
332
|
+
evalscope-0.13.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
|
|
333
|
+
evalscope-0.13.0.dist-info/RECORD,,
|
tests/cli/test_all.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
|
+
from dotenv import dotenv_values
|
|
3
|
+
|
|
4
|
+
env = dotenv_values('.env')
|
|
5
|
+
|
|
6
|
+
import os
|
|
7
|
+
import subprocess
|
|
8
|
+
import unittest
|
|
9
|
+
|
|
10
|
+
from evalscope.config import TaskConfig
|
|
11
|
+
from evalscope.constants import EvalType, JudgeStrategy, OutputType
|
|
12
|
+
from evalscope.run import run_task
|
|
13
|
+
from evalscope.utils import is_module_installed, test_level_list
|
|
14
|
+
from evalscope.utils.logger import get_logger
|
|
15
|
+
|
|
16
|
+
os.environ['LOG_LEVEL'] = 'DEBUG'
|
|
17
|
+
|
|
18
|
+
logger = get_logger()
|
|
19
|
+
|
|
20
|
+
# Benchmark names handed to ``TaskConfig(datasets=...)`` in ``TestRun.test_benchmarks``.
# Only the uncommented names at the bottom of the list are active; the names
# kept in comments are currently disabled and can be re-enabled by uncommenting.
datasets = [
    # 'iquiz', 'ifeval', 'mmlu', 'mmlu_pro', 'musr', 'process_bench',
    # 'race', 'trivia_qa', 'cmmlu', 'humaneval', 'gsm8k', 'bbh',
    # 'competition_math', 'math_500', 'aime24', 'gpqa', 'arc', 'ceval',
    # 'hellaswag', 'general_mcq', 'general_qa',
    'super_gpqa',
    'live_code_bench',
    'simple_qa',
    'chinese_simpleqa',
]
|
|
47
|
+
|
|
48
|
+
# Per-benchmark options handed to ``TaskConfig(dataset_args=...)``, keyed by
# benchmark name. The mapping also carries entries for benchmarks that are
# currently commented out of ``datasets``.
dataset_args = {
    'mmlu': {
        'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
        'few_shot_num': 0,
    },
    'mmlu_pro': {'subset_list': ['math', 'health'], 'few_shot_num': 4},
    'ceval': {
        'subset_list': ['computer_network', 'operating_system', 'computer_architecture'],
        'few_shot_num': 0,
    },
    'cmmlu': {'subset_list': ['elementary_chinese'], 'few_shot_num': 0},
    'bbh': {'subset_list': ['word_sorting', 'movie_recommendation']},
    'gpqa': {'subset_list': ['gpqa_diamond'], 'few_shot_num': 0},
    'humaneval': {'metric_list': ['Pass@1', 'Pass@2', 'Pass@5']},
    'competition_math': {'subset_list': ['Level 1']},
    'math_500': {'subset_list': ['Level 1']},
    'process_bench': {'subset_list': ['gsm8k']},
    'musr': {'subset_list': ['murder_mysteries']},
    'general_mcq': {
        # Path to the custom dataset.
        'local_path': 'custom_eval/text/mcq',
        # Evaluation dataset name: the "*" in the "*_dev.csv" file mentioned above.
        'subset_list': ['example'],
    },
    'general_qa': {
        # Path to the custom dataset.
        'local_path': 'custom_eval/text/qa',
        # Evaluation dataset name: the "*" in the "*_dev.csv" file mentioned above.
        # A second subset, 'test', is currently disabled.
        'subset_list': ['example'],
        'metric_list': ['AverageBLEU'],
    },
    'super_gpqa': {'subset_list': ['Philosophy', 'Education'], 'few_shot_num': 0},
    'live_code_bench': {
        'subset_list': ['v4_v5'],
        'extra_params': {'start_date': '2024-12-01', 'end_date': '2025-01-01'},
    },
}
|
|
115
|
+
|
|
116
|
+
class TestRun(unittest.TestCase):
    """Service-mode run over the benchmarks selected in ``datasets``."""

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_benchmarks(self):
        """Evaluate every entry of ``datasets`` through the DashScope endpoint.

        The test only runs when level 0 is in ``test_level_list()``. It builds
        a ``TaskConfig`` from the module-level ``datasets`` / ``dataset_args``
        and passes it to ``run_task``; the test itself makes no assertion on
        the result.
        """
        # ``TaskConfig`` is already imported at module level, so no local
        # re-import is needed here.
        # The evaluated model and the judge use the same model name, endpoint
        # and key, so each is defined once and reused.
        model_name = 'qwen2.5-7b-instruct'
        service_url = 'https://dashscope.aliyuncs.com/compatible-mode/v1'
        # Read from the ``.env`` file loaded at import time; ``None`` when the
        # key is not defined there.
        api_key = env.get('DASHSCOPE_API_KEY')

        task_cfg = TaskConfig(
            model=model_name,
            api_url=service_url,
            api_key=api_key,
            eval_type=EvalType.SERVICE,
            datasets=datasets,
            dataset_args=dataset_args,
            eval_batch_size=32,
            limit=2,
            stream=True,
            generation_config={
                'temperature': 0,
                'n': 1,
                'max_tokens': 4096,
            },
            judge_strategy=JudgeStrategy.AUTO,
            judge_model_args={
                'model_id': model_name,
                'api_url': service_url,
                'api_key': api_key,
            },
        )

        run_task(task_cfg=task_cfg)
|
tests/cli/test_collection.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
import json
|
|
2
|
+
import os
|
|
2
3
|
import unittest
|
|
3
4
|
|
|
4
5
|
from evalscope.collections import CollectionSchema, DatasetInfo, WeightedSampler
|
|
5
|
-
from evalscope.constants import EvalType
|
|
6
|
+
from evalscope.constants import EvalType, JudgeStrategy
|
|
6
7
|
from evalscope.utils.io_utils import dump_jsonl_data
|
|
7
8
|
from evalscope.utils.utils import test_level_list
|
|
8
9
|
|
|
@@ -44,7 +45,7 @@ class TestCollection(unittest.TestCase):
|
|
|
44
45
|
from evalscope import TaskConfig, run_task
|
|
45
46
|
|
|
46
47
|
task_cfg = TaskConfig(
|
|
47
|
-
model='Qwen2.5-
|
|
48
|
+
model='Qwen2.5-0.5B-Instruct',
|
|
48
49
|
api_url='http://127.0.0.1:8801/v1/chat/completions',
|
|
49
50
|
api_key='EMPTY',
|
|
50
51
|
eval_type=EvalType.SERVICE,
|
|
@@ -55,3 +56,28 @@ class TestCollection(unittest.TestCase):
|
|
|
55
56
|
}},
|
|
56
57
|
)
|
|
57
58
|
run_task(task_cfg=task_cfg)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
def test_evaluate_collection_with_judge(self):
    """Evaluate the ``data_collection`` dataset with the ``LLM_RECALL`` judge strategy.

    Runs only when level 0 is in ``test_level_list()``. The collection is read
    from ``outputs/mixed_data_test.jsonl`` and the API key for both the
    evaluated model and the judge comes from the ``DASHSCOPE_API_KEY``
    environment variable.
    """
    from evalscope import TaskConfig, run_task

    model_name = 'qwen2.5-7b-instruct'
    endpoint = 'https://dashscope.aliyuncs.com/compatible-mode/v1'

    collection_args = {
        'data_collection': {
            'local_path': 'outputs/mixed_data_test.jsonl',
            # Alternative input file: 'outputs/weighted_mixed_data.jsonl'
        },
    }

    task_cfg = TaskConfig(
        model=model_name,
        api_url=endpoint,
        api_key=os.getenv('DASHSCOPE_API_KEY'),
        eval_type=EvalType.SERVICE,
        datasets=['data_collection'],
        dataset_args=collection_args,
        limit=10,
        judge_strategy=JudgeStrategy.LLM_RECALL,
        judge_model_args={
            'model_id': model_name,
            'api_url': endpoint,
            'api_key': os.getenv('DASHSCOPE_API_KEY'),
        },
    )
    run_task(task_cfg=task_cfg)
|