evalscope 0.15.1__py3-none-any.whl → 0.16.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/app/__init__.py +28 -0
- evalscope/{report → app}/app.py +67 -59
- evalscope/app/constants.py +21 -0
- evalscope/arguments.py +12 -1
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/utils/embedding.py +75 -35
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
- evalscope/benchmarks/benchmark.py +1 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
- evalscope/benchmarks/data_adapter.py +101 -18
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +84 -0
- evalscope/benchmarks/docmath/utils.py +220 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +133 -0
- evalscope/benchmarks/drop/utils.py +59 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +90 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -1
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +341 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +70 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/utils.py +28 -2
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
- evalscope/cli/start_app.py +2 -2
- evalscope/collections/__init__.py +35 -3
- evalscope/collections/evaluator.py +94 -32
- evalscope/config.py +54 -17
- evalscope/evaluator/evaluator.py +80 -41
- evalscope/metrics/__init__.py +3 -1
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +15 -8
- evalscope/metrics/math_parser.py +1 -1
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/models/adapters/chat_adapter.py +51 -34
- evalscope/models/adapters/server_adapter.py +17 -25
- evalscope/perf/arguments.py +16 -7
- evalscope/perf/benchmark.py +0 -15
- evalscope/perf/main.py +72 -15
- evalscope/perf/plugin/datasets/custom.py +15 -0
- evalscope/perf/utils/benchmark_util.py +34 -16
- evalscope/perf/utils/db_util.py +25 -15
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/perf/utils/log_utils.py +12 -5
- evalscope/perf/utils/rich_display.py +186 -0
- evalscope/report/__init__.py +36 -4
- evalscope/report/combinator.py +8 -0
- evalscope/report/generator.py +33 -9
- evalscope/report/utils.py +61 -4
- evalscope/run.py +12 -0
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/deprecation_utils.py +42 -0
- evalscope/utils/logger.py +1 -1
- evalscope/utils/utils.py +12 -0
- evalscope/version.py +2 -2
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/METADATA +57 -31
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/RECORD +78 -57
- tests/aigc/test_t2i.py +40 -3
- tests/cli/test_all.py +39 -32
- tests/cli/test_collection.py +8 -6
- tests/cli/test_run.py +43 -17
- tests/perf/test_perf.py +23 -0
- tests/rag/test_mteb.py +5 -5
- /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/LICENSE +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/WHEEL +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: evalscope
|
|
3
|
-
Version: 0.15.1
|
|
3
|
+
Version: 0.16.1
|
|
4
4
|
Summary: EvalScope: Lightweight LLMs Evaluation Framework
|
|
5
5
|
Home-page: https://github.com/modelscope/evalscope
|
|
6
6
|
Author: ModelScope team
|
|
@@ -17,12 +17,12 @@ Requires-Python: >=3.8
|
|
|
17
17
|
Description-Content-Type: text/markdown
|
|
18
18
|
License-File: LICENSE
|
|
19
19
|
Requires-Dist: accelerate
|
|
20
|
-
Requires-Dist: datasets
|
|
20
|
+
Requires-Dist: datasets>=3.0
|
|
21
21
|
Requires-Dist: immutabledict
|
|
22
22
|
Requires-Dist: jieba
|
|
23
23
|
Requires-Dist: jsonlines
|
|
24
24
|
Requires-Dist: langdetect
|
|
25
|
-
Requires-Dist: latex2sympy2
|
|
25
|
+
Requires-Dist: latex2sympy2-extended
|
|
26
26
|
Requires-Dist: matplotlib
|
|
27
27
|
Requires-Dist: modelscope[framework]
|
|
28
28
|
Requires-Dist: nltk>=3.9
|
|
@@ -52,12 +52,12 @@ Requires-Dist: open-clip-torch; extra == "aigc"
|
|
|
52
52
|
Requires-Dist: opencv-python; extra == "aigc"
|
|
53
53
|
Provides-Extra: all
|
|
54
54
|
Requires-Dist: accelerate; extra == "all"
|
|
55
|
-
Requires-Dist: datasets
|
|
55
|
+
Requires-Dist: datasets>=3.0; extra == "all"
|
|
56
56
|
Requires-Dist: immutabledict; extra == "all"
|
|
57
57
|
Requires-Dist: jieba; extra == "all"
|
|
58
58
|
Requires-Dist: jsonlines; extra == "all"
|
|
59
59
|
Requires-Dist: langdetect; extra == "all"
|
|
60
|
-
Requires-Dist: latex2sympy2; extra == "all"
|
|
60
|
+
Requires-Dist: latex2sympy2-extended; extra == "all"
|
|
61
61
|
Requires-Dist: matplotlib; extra == "all"
|
|
62
62
|
Requires-Dist: modelscope[framework]; extra == "all"
|
|
63
63
|
Requires-Dist: nltk>=3.9; extra == "all"
|
|
@@ -79,21 +79,22 @@ Requires-Dist: torchvision; extra == "all"
|
|
|
79
79
|
Requires-Dist: tqdm; extra == "all"
|
|
80
80
|
Requires-Dist: transformers>=4.33; extra == "all"
|
|
81
81
|
Requires-Dist: word2number; extra == "all"
|
|
82
|
-
Requires-Dist: ms-opencompass>=0.1.
|
|
83
|
-
Requires-Dist: ms-vlmeval>=0.0.
|
|
82
|
+
Requires-Dist: ms-opencompass>=0.1.6; extra == "all"
|
|
83
|
+
Requires-Dist: ms-vlmeval>=0.0.17; extra == "all"
|
|
84
84
|
Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "all"
|
|
85
85
|
Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "all"
|
|
86
86
|
Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "all"
|
|
87
87
|
Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "all"
|
|
88
|
-
Requires-Dist: mteb==1.
|
|
88
|
+
Requires-Dist: mteb==1.38.20; extra == "all"
|
|
89
89
|
Requires-Dist: ragas==0.2.14; extra == "all"
|
|
90
90
|
Requires-Dist: webdataset>0.2.0; extra == "all"
|
|
91
91
|
Requires-Dist: aiohttp; extra == "all"
|
|
92
92
|
Requires-Dist: fastapi; extra == "all"
|
|
93
93
|
Requires-Dist: numpy; extra == "all"
|
|
94
|
+
Requires-Dist: rich; extra == "all"
|
|
94
95
|
Requires-Dist: sse-starlette; extra == "all"
|
|
95
96
|
Requires-Dist: transformers; extra == "all"
|
|
96
|
-
Requires-Dist:
|
|
97
|
+
Requires-Dist: uvicorn; extra == "all"
|
|
97
98
|
Requires-Dist: gradio==5.4.0; extra == "all"
|
|
98
99
|
Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
|
|
99
100
|
Requires-Dist: diffusers; extra == "all"
|
|
@@ -105,24 +106,25 @@ Provides-Extra: app
|
|
|
105
106
|
Requires-Dist: gradio==5.4.0; extra == "app"
|
|
106
107
|
Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
|
|
107
108
|
Provides-Extra: opencompass
|
|
108
|
-
Requires-Dist: ms-opencompass>=0.1.
|
|
109
|
+
Requires-Dist: ms-opencompass>=0.1.6; extra == "opencompass"
|
|
109
110
|
Provides-Extra: perf
|
|
110
111
|
Requires-Dist: aiohttp; extra == "perf"
|
|
111
112
|
Requires-Dist: fastapi; extra == "perf"
|
|
112
113
|
Requires-Dist: numpy; extra == "perf"
|
|
114
|
+
Requires-Dist: rich; extra == "perf"
|
|
113
115
|
Requires-Dist: sse-starlette; extra == "perf"
|
|
114
116
|
Requires-Dist: transformers; extra == "perf"
|
|
115
|
-
Requires-Dist:
|
|
117
|
+
Requires-Dist: uvicorn; extra == "perf"
|
|
116
118
|
Provides-Extra: rag
|
|
117
119
|
Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "rag"
|
|
118
120
|
Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "rag"
|
|
119
121
|
Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "rag"
|
|
120
122
|
Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "rag"
|
|
121
|
-
Requires-Dist: mteb==1.
|
|
123
|
+
Requires-Dist: mteb==1.38.20; extra == "rag"
|
|
122
124
|
Requires-Dist: ragas==0.2.14; extra == "rag"
|
|
123
125
|
Requires-Dist: webdataset>0.2.0; extra == "rag"
|
|
124
126
|
Provides-Extra: vlmeval
|
|
125
|
-
Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
|
|
127
|
+
Requires-Dist: ms-vlmeval>=0.0.17; extra == "vlmeval"
|
|
126
128
|
|
|
127
129
|
<p align="center">
|
|
128
130
|
<br>
|
|
@@ -177,9 +179,23 @@ Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
|
|
|
177
179
|
|
|
178
180
|
## 📝 Introduction
|
|
179
181
|
|
|
180
|
-
EvalScope is [ModelScope](https://modelscope.cn/)
|
|
182
|
+
EvalScope is a comprehensive model evaluation and performance benchmarking framework meticulously crafted by the [ModelScope Community](https://modelscope.cn/), offering a one-stop solution for your model assessment needs. Regardless of the type of model you are developing, EvalScope is equipped to cater to your requirements:
|
|
181
183
|
|
|
182
|
-
|
|
184
|
+
- 🧠 Large Language Models
|
|
185
|
+
- 🎨 Multimodal Models
|
|
186
|
+
- 🔍 Embedding Models
|
|
187
|
+
- 🏆 Reranker Models
|
|
188
|
+
- 🖼️ CLIP Models
|
|
189
|
+
- 🎭 AIGC Models (Text-to-Image/Video)
|
|
190
|
+
- ...and more!
|
|
191
|
+
|
|
192
|
+
EvalScope is not merely an evaluation tool; it is a valuable ally in your model optimization journey:
|
|
193
|
+
|
|
194
|
+
- 🏅 Equipped with multiple industry-recognized benchmarks and evaluation metrics: MMLU, CMMLU, C-Eval, GSM8K, etc.
|
|
195
|
+
- 📊 Model inference performance stress testing: Ensuring your model excels in real-world applications.
|
|
196
|
+
- 🚀 Seamless integration with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, enabling one-click evaluations and providing full-chain support from training to assessment for your model development.
|
|
197
|
+
|
|
198
|
+
Below is the overall architecture diagram of EvalScope:
|
|
183
199
|
|
|
184
200
|
<p align="center">
|
|
185
201
|
<img src="docs/en/_static/images/evalscope_framework.png" width="70%">
|
|
@@ -214,6 +230,10 @@ Please scan the QR code below to join our community groups:
|
|
|
214
230
|
|
|
215
231
|
## 🎉 News
|
|
216
232
|
|
|
233
|
+
- 🔥 **[2025.06.02]** Added support for the Needle-in-a-Haystack test. Simply specify `needle_haystack` to conduct the test, and a corresponding heatmap will be generated in the `outputs/reports` folder, providing a visual representation of the model's performance. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html) for more details.
|
|
234
|
+
- 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
|
|
235
|
+
- 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
|
|
236
|
+
- 🔥 **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate a model's tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
|
|
217
237
|
- 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
|
|
218
238
|
- 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
|
|
219
239
|
- 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
|
|
@@ -479,26 +499,27 @@ For more customized evaluations, such as customizing model parameters or dataset
|
|
|
479
499
|
|
|
480
500
|
```shell
|
|
481
501
|
evalscope eval \
|
|
482
|
-
--model Qwen/
|
|
483
|
-
--model-args revision
|
|
484
|
-
--generation-config do_sample
|
|
502
|
+
--model Qwen/Qwen3-0.6B \
|
|
503
|
+
--model-args '{"revision": "master", "precision": "torch.float16", "device_map": "auto"}' \
|
|
504
|
+
--generation-config '{"do_sample":true,"temperature":0.6,"max_new_tokens":512,"chat_template_kwargs":{"enable_thinking": false}}' \
|
|
485
505
|
--dataset-args '{"gsm8k": {"few_shot_num": 0, "few_shot_random": false}}' \
|
|
486
506
|
--datasets gsm8k \
|
|
487
507
|
--limit 10
|
|
488
508
|
```
|
|
489
509
|
|
|
490
|
-
### Parameter
|
|
491
|
-
- `--model-args`: Model loading parameters,
|
|
492
|
-
- `revision`: Model version
|
|
493
|
-
- `precision`: Model precision
|
|
494
|
-
- `device_map`:
|
|
495
|
-
- `--generation-config`: Generation parameters,
|
|
496
|
-
- `do_sample`: Whether to use sampling
|
|
497
|
-
- `
|
|
498
|
-
- `max_new_tokens`: Maximum length of
|
|
499
|
-
-
|
|
510
|
+
### Parameter Description
|
|
511
|
+
- `--model-args`: Model loading parameters, passed as a JSON string:
|
|
512
|
+
- `revision`: Model version
|
|
513
|
+
- `precision`: Model precision
|
|
514
|
+
- `device_map`: Device allocation for the model
|
|
515
|
+
- `--generation-config`: Generation parameters, passed as a JSON string and parsed as a dictionary:
|
|
516
|
+
- `do_sample`: Whether to use sampling
|
|
517
|
+
- `temperature`: Generation temperature
|
|
518
|
+
- `max_new_tokens`: Maximum length of generated tokens
|
|
519
|
+
- `chat_template_kwargs`: Model inference template parameters
|
|
520
|
+
- `--dataset-args`: Settings for the evaluation dataset, passed as a JSON string where the key is the dataset name and the value is the parameters. Note that these need to correspond one-to-one with the values in the `--datasets` parameter:
|
|
500
521
|
- `few_shot_num`: Number of few-shot examples
|
|
501
|
-
- `few_shot_random`: Whether to randomly sample few-shot data
|
|
522
|
+
- `few_shot_random`: Whether to randomly sample few-shot data; if not set, defaults to `true`
|
|
502
523
|
|
|
503
524
|
Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)
|
|
504
525
|
|
|
@@ -517,6 +538,11 @@ A stress testing tool focused on large language models, which can be customized
|
|
|
517
538
|
|
|
518
539
|
Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html)
|
|
519
540
|
|
|
541
|
+
**Output example**
|
|
542
|
+
|
|
543
|
+

|
|
544
|
+
|
|
545
|
+
|
|
520
546
|
**Supports wandb for recording results**
|
|
521
547
|
|
|
522
548
|

|
|
@@ -565,7 +591,7 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
|
|
|
565
591
|
</a>
|
|
566
592
|
|
|
567
593
|
## 🔜 Roadmap
|
|
568
|
-
- [
|
|
594
|
+
- [x] Support for better evaluation report visualization
|
|
569
595
|
- [x] Support for mixed evaluations across multiple datasets
|
|
570
596
|
- [x] RAG evaluation
|
|
571
597
|
- [x] VLM evaluation
|
|
@@ -575,7 +601,7 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
|
|
|
575
601
|
- [x] Multi-modal evaluation
|
|
576
602
|
- [ ] Benchmarks
|
|
577
603
|
- [ ] GAIA
|
|
578
|
-
- [
|
|
604
|
+
- [x] GPQA
|
|
579
605
|
- [x] MBPP
|
|
580
606
|
|
|
581
607
|
|
|
@@ -1,16 +1,20 @@
|
|
|
1
1
|
evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
|
|
2
|
-
evalscope/arguments.py,sha256=
|
|
3
|
-
evalscope/config.py,sha256=
|
|
2
|
+
evalscope/arguments.py,sha256=QkxE8eGSryiyo9uDiNQNZUI3l_hGPYmhVz1-KHgtB6E,6044
|
|
3
|
+
evalscope/config.py,sha256=HGvIlhjVjA9QtAiNEUrx_hev3wa-RaNEXelEiLJn9OM,11015
|
|
4
4
|
evalscope/constants.py,sha256=PHnsGndB4N5-jvmawPxMK5b9geE2Es5cUe8ZKYSuKgM,4016
|
|
5
|
-
evalscope/run.py,sha256=
|
|
5
|
+
evalscope/run.py,sha256=saHZGlwbBLYtFk4BmKkjQEOOHQQ-pDKzN21taao6Os0,6957
|
|
6
6
|
evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
|
|
7
7
|
evalscope/summarizer.py,sha256=61kU5ZoSh1dd8HMJPqP3ZvJwcY9szwWFCZdu2lfATJA,5920
|
|
8
|
-
evalscope/version.py,sha256=
|
|
8
|
+
evalscope/version.py,sha256=vMuGTezikPNdTLYlejHdHznB5WhuHCnAhaOdw3iqU5E,119
|
|
9
|
+
evalscope/app/__init__.py,sha256=HWLXld_JXcBDsdL4L_4E8JsKyuBwwPUSwlejKnZ3HKc,579
|
|
10
|
+
evalscope/app/app.py,sha256=sTYoc3Uag7DqYbb_qXo8QJX4oer8dueQK1wdgaLlTiY,29371
|
|
11
|
+
evalscope/app/arguments.py,sha256=1wHTLeFx1G94cKXYOeOVe_wTiOY2D929UctIRGOtRaQ,699
|
|
12
|
+
evalscope/app/constants.py,sha256=KpItEl9lF0VldOm0grjS7RVbbseemtsXZJKtgGmAQB8,361
|
|
9
13
|
evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
14
|
evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
|
|
11
15
|
evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
|
|
12
16
|
evalscope/backend/opencompass/api_meta_template.py,sha256=DaBJg15ZSIjxroXiygl3-4RdmIe_FD7xHbXvjSZmkQA,1706
|
|
13
|
-
evalscope/backend/opencompass/backend_manager.py,sha256=
|
|
17
|
+
evalscope/backend/opencompass/backend_manager.py,sha256=kIPzirjAOW0_YNQiCrhjRfAVD3UpcGmr4RXBH-WMH0Y,10409
|
|
14
18
|
evalscope/backend/opencompass/tasks/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
15
19
|
evalscope/backend/opencompass/tasks/eval_api.py,sha256=ZaGdUbEOtAW5VX3ZXmpHIttg_QrID34EnBTylD3uvos,1152
|
|
16
20
|
evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=JHSq4EnPJgv4sRJJplLH80EqE3ghtkn2k8HnV6DaDew,5406
|
|
@@ -27,12 +31,12 @@ evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py,sha256=t0U
|
|
|
27
31
|
evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py,sha256=rZY-TulG-Cb8b6GTBxqTDYQ_4Ois3kbgKhuunZq8Ato,8407
|
|
28
32
|
evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt,sha256=eiiAaxhS48b5rVLy5O9VvFfV2AfxY86ITu_iqT7ZLkQ,649
|
|
29
33
|
evalscope/backend/rag_eval/cmteb/__init__.py,sha256=I502GHPFYo8BwlFvoljGKI24PY76eBXJQiquWk8nJNU,280
|
|
30
|
-
evalscope/backend/rag_eval/cmteb/arguments.py,sha256=
|
|
34
|
+
evalscope/backend/rag_eval/cmteb/arguments.py,sha256=xROhoVxJvMhhU9S5SKtiavQHM447esbrVWlbmes4AVI,2814
|
|
31
35
|
evalscope/backend/rag_eval/cmteb/base.py,sha256=UCobQ81dHkiTmIz_0BJ_VANj_uG6mkJbYLKJztvMXfo,2849
|
|
32
|
-
evalscope/backend/rag_eval/cmteb/task_template.py,sha256=
|
|
36
|
+
evalscope/backend/rag_eval/cmteb/task_template.py,sha256=vPfbBvtVjX6U6QHEG5mRP9CQjFMF-_8EdrpYoNHbDFU,3303
|
|
33
37
|
evalscope/backend/rag_eval/cmteb/tasks/Classification.py,sha256=sqbH0XmSiIm4n5UX5sXMwJHby1r-d35mwW1tKIhb2Hg,10848
|
|
34
38
|
evalscope/backend/rag_eval/cmteb/tasks/Clustering.py,sha256=-GTwORxILSkkXXGtTxuPTKSHNXQEllCRoUjuR7pnwFM,8962
|
|
35
|
-
evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py,sha256=
|
|
39
|
+
evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py,sha256=_uuDPaerh6qbxw7W3DiPrWuxfEyLeKCHeduYcp-1Veg,2025
|
|
36
40
|
evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py,sha256=yISp67pXw4fSrsqTiYmfas6uPyqwE45L1c58Tpydc0E,4075
|
|
37
41
|
evalscope/backend/rag_eval/cmteb/tasks/Reranking.py,sha256=AH7jwJ45WAVxVb60I2DTURVanIAbrlZzk-ey_dHWEO0,5491
|
|
38
42
|
evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py,sha256=ofmmeoieXHmU6O14JKWO9GUpuEEmcWwc78Q7ZJjRDZs,11454
|
|
@@ -49,15 +53,15 @@ evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=YSqpaXMFVe8m
|
|
|
49
53
|
evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=6x-4O2pgsjZCVfJNvwZEKcgLe_QhSknPg-f2jGjZkU4,1890
|
|
50
54
|
evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
51
55
|
evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
|
|
52
|
-
evalscope/backend/rag_eval/utils/embedding.py,sha256=
|
|
53
|
-
evalscope/backend/rag_eval/utils/llm.py,sha256=
|
|
56
|
+
evalscope/backend/rag_eval/utils/embedding.py,sha256=3CkLX6SXGAc6ltUQe4V_IcTr71cZSane5-VjaRYn13M,9466
|
|
57
|
+
evalscope/backend/rag_eval/utils/llm.py,sha256=NHjm0SeQVsSIG8uISXZcQypku4QRc3KtteeO9ldv0FI,2611
|
|
54
58
|
evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
|
|
55
59
|
evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
|
|
56
60
|
evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=sUYvQxCtPl6CrcwhQpY8lJjW5skqWc-fvHUSnXd_MvQ,6054
|
|
57
61
|
evalscope/benchmarks/__init__.py,sha256=5AXNhhmbaBFEe3u7y5TtIrviYzFI-hC8oKqxFILs1pE,937
|
|
58
|
-
evalscope/benchmarks/benchmark.py,sha256=
|
|
59
|
-
evalscope/benchmarks/data_adapter.py,sha256=
|
|
60
|
-
evalscope/benchmarks/utils.py,sha256=
|
|
62
|
+
evalscope/benchmarks/benchmark.py,sha256=X-vBzz5PDVI5rBbqWpiUZq0bmGhp9cRZiA27XCgxPdE,2573
|
|
63
|
+
evalscope/benchmarks/data_adapter.py,sha256=Z2s4mfJssxNAeFPVNgZLkBbc3DBbJRZNGbRBigLe4I4,22893
|
|
64
|
+
evalscope/benchmarks/utils.py,sha256=81MwUJYWjJgoiRClY-IFB-EZN0th-oQDTvU2ekaEmpc,1869
|
|
61
65
|
evalscope/benchmarks/aigc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
62
66
|
evalscope/benchmarks/aigc/t2i/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
63
67
|
evalscope/benchmarks/aigc/t2i/base.py,sha256=4GFAvceT1Gpt5teDLRCZi62RwvPazuhG3zwft3gN3X4,2102
|
|
@@ -70,7 +74,7 @@ evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
|
|
|
70
74
|
evalscope/benchmarks/aime/aime24_adapter.py,sha256=GrIxCHpUwgUy8tXGTB7iQOt8k7wG8MJB0CWbwBmIy-8,1703
|
|
71
75
|
evalscope/benchmarks/aime/aime25_adapter.py,sha256=yxo5roCb8ryX9ROUU2FdZ-WBTUPZ14MrBzEL0zPOh-U,1718
|
|
72
76
|
evalscope/benchmarks/alpaca_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
73
|
-
evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=
|
|
77
|
+
evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=oUHpWrt5Gx0jF80RBd7zTh_1AWI66YvDd6U1vOMoqj0,3828
|
|
74
78
|
evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
75
79
|
evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
|
|
76
80
|
evalscope/benchmarks/arc/arc_adapter.py,sha256=0h-eT4BBmUJQrakKMPUNE1nSRwK6LHB-cflWpWzY978,6364
|
|
@@ -110,7 +114,7 @@ evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTP
|
|
|
110
114
|
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=1ITBXI0f01Dt1p7sb2RGswIeg9685Bkk2S2xmA1vat8,11295
|
|
111
115
|
evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
|
|
112
116
|
evalscope/benchmarks/chinese_simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
113
|
-
evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=
|
|
117
|
+
evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=Q6ncuLrCUrrhhljIfMsgWnyhHfcWWwh8iA6NZvz3W28,8079
|
|
114
118
|
evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
115
119
|
evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
|
|
116
120
|
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=r9zael_Y2Jso0ashevYpF8e5SHOBh8iMcPIJU5WT3pQ,10367
|
|
@@ -120,10 +124,19 @@ evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc
|
|
|
120
124
|
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=wgejW-_QswtT8_3JKAQ_H6svH8IotDJDBEH7X4nP4bY,6760
|
|
121
125
|
evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
122
126
|
evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=QgLgIrjD3q53T-lu1UWTV6T4h1cKGoCQDh0O4QxFezw,2569
|
|
127
|
+
evalscope/benchmarks/docmath/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
128
|
+
evalscope/benchmarks/docmath/docmath_adapter.py,sha256=GAoHuFASKyWCVbB0nmImsEB-YCREwB75WjdqYB0CcyU,2912
|
|
129
|
+
evalscope/benchmarks/docmath/utils.py,sha256=ptd-Sot4QtUmUG4dMlqXtUWHKZplo5jSTolsypqX9Ho,7716
|
|
130
|
+
evalscope/benchmarks/drop/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
131
|
+
evalscope/benchmarks/drop/drop_adapter.py,sha256=V-Vx6g2_1kcDUDWOKVX1vPSLt5iHn8NQkpWbsIwPaa4,8325
|
|
132
|
+
evalscope/benchmarks/drop/utils.py,sha256=Z9PHrNnRfGqFHCLONg5SWKARp1eTJlHFc_bU46t_YrM,1344
|
|
133
|
+
evalscope/benchmarks/frames/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
134
|
+
evalscope/benchmarks/frames/frames_adapter.py,sha256=wbug6yDlq6N5SfCQaOn43K8klJjrZc9iigFEPQs5nKA,3096
|
|
135
|
+
evalscope/benchmarks/frames/utils.py,sha256=gULWM6Rwv5bTSSWcDYp-iSIoWj8r5VtbQakhRzHJq8A,1172
|
|
123
136
|
evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
124
137
|
evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=fqbt61owPP7t2H4B2zbYVZTs0VBGuXNvWGvkukwhRYc,5039
|
|
125
138
|
evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
126
|
-
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=
|
|
139
|
+
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=40mZovspVf-OXcuEu3ei6G_HZlYA8whAHSESHPPONxA,4750
|
|
127
140
|
evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
128
141
|
evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
|
|
129
142
|
evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=UB287DtnbkSQXZsbReFJqmQRwbo672DTCeXXilR_-Vc,4790
|
|
@@ -166,6 +179,9 @@ evalscope/benchmarks/mmlu_redux/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
|
|
|
166
179
|
evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=Kr30i_exxBJRz9PLB5g6F04e2HJ4WuF6LDyAwaRh2MY,9578
|
|
167
180
|
evalscope/benchmarks/musr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
168
181
|
evalscope/benchmarks/musr/musr_adapter.py,sha256=85P0sY7H9pthYdCjkE2AOxaiNhcIBW1iZmODkz3FN0M,2464
|
|
182
|
+
evalscope/benchmarks/needle_haystack/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
183
|
+
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py,sha256=rNi7ULskhhHh1eVN1eV15gyLVFE05uertlZlCzMzgOE,15355
|
|
184
|
+
evalscope/benchmarks/needle_haystack/utils.py,sha256=bDwtpMS7Eqr63urCttS9i3BvT_aPuNvrQU-vEc6tcx0,2911
|
|
169
185
|
evalscope/benchmarks/process_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
170
186
|
evalscope/benchmarks/process_bench/critique_template.txt,sha256=tycx8n42QEC0uGcwbIvHfZvfTnchlRxGz8Tp1R2_e_Y,489
|
|
171
187
|
evalscope/benchmarks/process_bench/process_bench_adapter.py,sha256=ydU-r1T0DaYhOxkhZgGL7PhDd4XoeqOBzVO9oiFPd8M,3422
|
|
@@ -174,12 +190,15 @@ evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO
|
|
|
174
190
|
evalscope/benchmarks/race/race_adapter.py,sha256=RD0B-i5dzeNKuhqnWbremgf4tk9jmOO4_eLAiITB1F0,6381
|
|
175
191
|
evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
|
|
176
192
|
evalscope/benchmarks/simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
177
|
-
evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=
|
|
193
|
+
evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=TD7hkMLGZ4GK7wD7cwqJ3jCcTAaixOakUy3o5DaPYHI,8997
|
|
178
194
|
evalscope/benchmarks/super_gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
179
195
|
evalscope/benchmarks/super_gpqa/five_shot_prompt.txt,sha256=vD3RMeQustxY_oWA8IobntjywT8ZUO7Jaub--rElDT4,4718
|
|
180
196
|
evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=BqNLL8BYnK6tRuIdV6ijL4Uym2SejH_h1BV06XNjSE4,9331
|
|
181
197
|
evalscope/benchmarks/super_gpqa/utils.py,sha256=ftYPP9ODvLBlQSd9ltACx9iRIvjB8u1bg4AtgcJ4JAI,3360
|
|
182
198
|
evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt,sha256=y7hR9SmoR_YqoEWtT8N9JpZOpeJIlg0cDGDgYw6R6hM,237
|
|
199
|
+
evalscope/benchmarks/tool_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
200
|
+
evalscope/benchmarks/tool_bench/tool_bench_adapter.py,sha256=c8_Cok_wctlBtWd7kDQY9McaFbkWsW9LTC5JzPpef-Q,2399
|
|
201
|
+
evalscope/benchmarks/tool_bench/utils.py,sha256=led0d-Pa3rvmWkSWhEnZWP00fceudgESq5HXAQzJGls,7042
|
|
183
202
|
evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
184
203
|
evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
|
|
185
204
|
evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
|
|
@@ -187,30 +206,32 @@ evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=7tMc8vVZdBnks5jWrBSrb
|
|
|
187
206
|
evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
188
207
|
evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
|
|
189
208
|
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=ueUU860kg5_xf_MtUCa6ck-fGHX3ttw8Xh3mWSJyOZA,12617
|
|
209
|
+
evalscope/benchmarks/winogrande/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
210
|
+
evalscope/benchmarks/winogrande/winogrande_adapter.py,sha256=UdANz3YmCtV2YfGuEihTe3vpUTlIxeXBhIqGkKbTFdU,1956
|
|
190
211
|
evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
191
212
|
evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
|
|
192
213
|
evalscope/cli/cli.py,sha256=w_dtXljur9s5lmTn6LbbFL_viTPQB1WAEzhYcId09Og,729
|
|
193
|
-
evalscope/cli/start_app.py,sha256=
|
|
214
|
+
evalscope/cli/start_app.py,sha256=dV63nvBYEUl2sGeVxoUH4IJBXJSLecaq293i3alBWxo,794
|
|
194
215
|
evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,775
|
|
195
216
|
evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
|
|
196
217
|
evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
|
|
197
|
-
evalscope/collections/__init__.py,sha256=
|
|
198
|
-
evalscope/collections/evaluator.py,sha256=
|
|
218
|
+
evalscope/collections/__init__.py,sha256=3v7tVLcJk86FeNBrxw3pWhu_lcpKYrnT_dDACCeR2Io,853
|
|
219
|
+
evalscope/collections/evaluator.py,sha256=NnLel9lOyR0wzOwxDGSCFWJN4zFx9ZA2hc0PI-FSvl0,16200
|
|
199
220
|
evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
|
|
200
221
|
evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
|
|
201
222
|
evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
|
|
202
|
-
evalscope/evaluator/evaluator.py,sha256=
|
|
223
|
+
evalscope/evaluator/evaluator.py,sha256=d8cFq08oJ6kbKcwr4mVh517OxndgyqUrmuEP-bwmR6g,22071
|
|
203
224
|
evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
|
|
204
225
|
evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
205
226
|
evalscope/evaluator/reviewer/auto_reviewer.py,sha256=5WRYuXFTDgVmolrOdiTysk-mXrpw6Qg87-iuY-VD1W4,16618
|
|
206
|
-
evalscope/metrics/__init__.py,sha256=
|
|
207
|
-
evalscope/metrics/llm_judge.py,sha256=
|
|
208
|
-
evalscope/metrics/math_parser.py,sha256=
|
|
227
|
+
evalscope/metrics/__init__.py,sha256=g96dZSt3Dh56TdVbe4yDqcfmr9DoLqH-R2__3Qvorjk,1497
|
|
228
|
+
evalscope/metrics/llm_judge.py,sha256=O2IaJpsBe1HqfCVnRYOt_PLWg6w85DYlYLU7yTq5idw,4384
|
|
229
|
+
evalscope/metrics/math_parser.py,sha256=JtOkj28XOtwoUACXOXLzCeRYz0rx0tBsQLQDU8cbC20,17311
|
|
209
230
|
evalscope/metrics/metrics.py,sha256=_YI7RhxlFu_JOgeE3LF9UKu6mJruvyu4FgqVf78Bjb8,13813
|
|
210
231
|
evalscope/metrics/named_metrics.py,sha256=PrzU_1mGTeRFxVJFT1aXxIOiS7MnNoWyZsb8uCRVDeE,2278
|
|
211
|
-
evalscope/metrics/rouge_metric.py,sha256=
|
|
232
|
+
evalscope/metrics/rouge_metric.py,sha256=bqvSotuDdC0MEKmt8v6y6tBTBx0S3Ma-tfF-cMCckA4,4645
|
|
212
233
|
evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
|
|
213
|
-
evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=
|
|
234
|
+
evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=T91PgJfi1As7BR7I-Hq6rLlvHAtMB9JpBw9gMTH8VlE,12114
|
|
214
235
|
evalscope/metrics/t2v_metrics/__init__.py,sha256=GBxgKTPVy_qhW_F3M4Oi6QMWhdAi4PqGX5w3t6Tueho,1783
|
|
215
236
|
evalscope/metrics/t2v_metrics/clipscore.py,sha256=IsrYKIlFb04-FfBq4MbSv4diS6706J15Y3G4qEFIwfU,455
|
|
216
237
|
evalscope/metrics/t2v_metrics/constants.py,sha256=oY5l5fOFl8qylah9eeebZm0pgY1PYmHDa7JlUC8Qls0,451
|
|
@@ -318,19 +339,19 @@ evalscope/models/model.py,sha256=MxvJAUNkuT7IA3bchnmJDur_YCKj9ShOD2Uq40dBcGc,630
|
|
|
318
339
|
evalscope/models/register.py,sha256=pNC69YUvw-lodYpOXmByHm26h4m0Lofgd_om-JhOBq4,1882
|
|
319
340
|
evalscope/models/adapters/__init__.py,sha256=mduiDZ6LgmkefNf4CtObZk6heOB93HxxgqTuYvrqWoo,590
|
|
320
341
|
evalscope/models/adapters/base_adapter.py,sha256=f2FY8DLERudkfb4_anxNVFE_D19xCJj9BObiHWspewI,3268
|
|
321
|
-
evalscope/models/adapters/chat_adapter.py,sha256=
|
|
342
|
+
evalscope/models/adapters/chat_adapter.py,sha256=PAClyBL_nQ1I1kmjeeZ3sdC-y5ZmfFj8rjCigh_vr40,7885
|
|
322
343
|
evalscope/models/adapters/choice_adapter.py,sha256=4fuz3MFEqK8ln4mMs3goMCdRPBwYmmgN70HTdr_sW_U,8005
|
|
323
344
|
evalscope/models/adapters/custom_adapter.py,sha256=w8cD0b3xgcdhSZelcat67CGJnALOfz5IALzURnLjab8,2275
|
|
324
|
-
evalscope/models/adapters/server_adapter.py,sha256=
|
|
345
|
+
evalscope/models/adapters/server_adapter.py,sha256=qdonCJLoM0qmFQtHziczUqVzA31p4AxIn2j9oNIosLw,6493
|
|
325
346
|
evalscope/models/adapters/t2i_adapter.py,sha256=xkMRyZ61yTiJfmULK-p9du4nNox41pkHiV2CTFBO3qM,2659
|
|
326
347
|
evalscope/models/custom/__init__.py,sha256=MZylegALg1HerOYtp-qbzu4Wb6PW3JbrxwONHU-PAVs,131
|
|
327
348
|
evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
|
|
328
349
|
evalscope/models/custom/dummy_model.py,sha256=WRT_aCBZLXnC4yRCgggkuySkhM71C47O2Txx_YNc3UM,1933
|
|
329
350
|
evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
330
|
-
evalscope/perf/arguments.py,sha256=
|
|
331
|
-
evalscope/perf/benchmark.py,sha256=
|
|
351
|
+
evalscope/perf/arguments.py,sha256=HUKzcU-FBt34DgGJ0nc5rNgJAMpZwYQXMz8VU8jokco,10668
|
|
352
|
+
evalscope/perf/benchmark.py,sha256=qEgIX_Z4x3FNtAKTMlP2mRJTerRV5seCbVtB4XklnQI,7566
|
|
332
353
|
evalscope/perf/http_client.py,sha256=-c3-N7bxKsj3d5DVsKSaYA3XAHJDzZgoqZBbhuDYIGk,7419
|
|
333
|
-
evalscope/perf/main.py,sha256=
|
|
354
|
+
evalscope/perf/main.py,sha256=yfJWGd2l4uU_qKW9bD6DzV0DK9XXuCJGLYjF_JWR22E,3394
|
|
334
355
|
evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
|
|
335
356
|
evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
|
|
336
357
|
evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
|
|
@@ -340,7 +361,7 @@ evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4I
|
|
|
340
361
|
evalscope/perf/plugin/api/openai_api.py,sha256=kTL_2OACuKhzd2W0Pf4DirpMumzk4V3rqKZ2mvBZVCs,7655
|
|
341
362
|
evalscope/perf/plugin/datasets/__init__.py,sha256=Z6Jc0RxJS_z0nBBV1-b0-56Ija60AtQ7I_67gY6ZfdQ,568
|
|
342
363
|
evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
|
|
343
|
-
evalscope/perf/plugin/datasets/custom.py,sha256
|
|
364
|
+
evalscope/perf/plugin/datasets/custom.py,sha256=-meul2hRmYvYAo--c_EtCnItRi5DvN7xxFOpq6vqdts,1346
|
|
344
365
|
evalscope/perf/plugin/datasets/flickr8k.py,sha256=MbJKEB0XqZE0nDEenwYs0FLH9QL658Vn9uQmUH4hPvk,1605
|
|
345
366
|
evalscope/perf/plugin/datasets/line_by_line.py,sha256=AqZYG6tVL3BIGnzh_2Tev8lDYezJG_1gqJY8bSNQl3Q,957
|
|
346
367
|
evalscope/perf/plugin/datasets/longalpaca.py,sha256=XelLris0-c3StLInQ-Oav4jqGcXPNfJxEDeYvaetEbI,1297
|
|
@@ -349,11 +370,12 @@ evalscope/perf/plugin/datasets/random_dataset.py,sha256=SIlsjAE_Stknfr6o1CBFvANB
|
|
|
349
370
|
evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=J6q7AF_Re5eHLVejXEw9c1jlk1T1PPmist0yO9UFTPE,2432
|
|
350
371
|
evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
351
372
|
evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
|
|
352
|
-
evalscope/perf/utils/benchmark_util.py,sha256=
|
|
353
|
-
evalscope/perf/utils/db_util.py,sha256=
|
|
373
|
+
evalscope/perf/utils/benchmark_util.py,sha256=EPKUDijue85b8KhSJoJKLh6comkTKRjq2yoEw4kxBho,7227
|
|
374
|
+
evalscope/perf/utils/db_util.py,sha256=xqrXZapP_WwUdzkgFBTh3LDBWzr_UoU8v13rOjQ8TT4,9876
|
|
354
375
|
evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
|
|
355
|
-
evalscope/perf/utils/local_server.py,sha256=
|
|
356
|
-
evalscope/perf/utils/log_utils.py,sha256=
|
|
376
|
+
evalscope/perf/utils/local_server.py,sha256=RL9rGd5tEniZ0aErhHcbVXMX22YmujfE11T3j37VL8k,4684
|
|
377
|
+
evalscope/perf/utils/log_utils.py,sha256=NWSK_ITG4yoVx5GMLbIRGDoXSs90s7X3mftdm37Os2U,1666
|
|
378
|
+
evalscope/perf/utils/rich_display.py,sha256=xZzeryQbYM6Cv8g1ulK6OQUE2CalQ_KtFxiy7pioeEU,8127
|
|
357
379
|
evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
358
380
|
evalscope/registry/config/cfg_arena.yaml,sha256=rub6ceaQxxB1mbSjdoFf0IaVgGfbOonV2nYRebv2OKo,3292
|
|
359
381
|
evalscope/registry/config/cfg_arena_zhihu.yaml,sha256=tvvihBwvoTjoezwTSaZwoGOB44ysofpnin4pNyY9TfQ,2755
|
|
@@ -375,12 +397,10 @@ evalscope/registry/tasks/general_qa.yaml,sha256=S3kdlrazWX2VAX2PMhNtBnFZVSnUKBNi
|
|
|
375
397
|
evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHMG0LXiM,729
|
|
376
398
|
evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
|
|
377
399
|
evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
|
|
378
|
-
evalscope/report/__init__.py,sha256=
|
|
379
|
-
evalscope/report/
|
|
380
|
-
evalscope/report/
|
|
381
|
-
evalscope/report/
|
|
382
|
-
evalscope/report/generator.py,sha256=q9aHWNjQgvutAKtpjfWOpfu5zNFdnXilO9OqBqt_Phg,3612
|
|
383
|
-
evalscope/report/utils.py,sha256=DRlbjbqHEmM8rGlA4pwtlHFhOZtyUzcqiS-mejfIDkU,4584
|
|
400
|
+
evalscope/report/__init__.py,sha256=mLCgT7G-WPagQHOGz97AOdLQJjyikrswDiXA8d9Wr_Q,923
|
|
401
|
+
evalscope/report/combinator.py,sha256=xGX0B6tGZxaEB20tziPQm3HUkvgftghKg5AEQ8JpsBE,2842
|
|
402
|
+
evalscope/report/generator.py,sha256=oykmQROG-Bt8ttCH4RtvmGJ39HmDJMTU6gG26lg5LHE,4321
|
|
403
|
+
evalscope/report/utils.py,sha256=KAc4Cq8NMxTUjCJHI5MK3ZqzBNjfDMXrwLBpUkaywjk,6520
|
|
384
404
|
evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
385
405
|
evalscope/third_party/longbench_write/README.md,sha256=1yLKeSVIcihpoc4KXr8NpK86JbcLssCPx76aOKdPbYI,5431
|
|
386
406
|
evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
|
|
@@ -413,32 +433,33 @@ evalscope/third_party/toolbench_static/config_default.yaml,sha256=-6n6Zyg9eHN2ee
|
|
|
413
433
|
evalscope/third_party/toolbench_static/eval.py,sha256=do_-lVi_vEoljeLYvt3b_AYSMqpdKzgYnTek9WLSKe8,8236
|
|
414
434
|
evalscope/third_party/toolbench_static/infer.py,sha256=rsADLhEd2IBcC6EI9aD7hSJmo6Oo5b22mnHWBCZLDPs,9010
|
|
415
435
|
evalscope/third_party/toolbench_static/requirements.txt,sha256=OW91Z8hfzh7yQUYgP1Di_E6DgNgGoGP1UcvnqrdCR68,22
|
|
416
|
-
evalscope/third_party/toolbench_static/toolbench_static.py,sha256=
|
|
436
|
+
evalscope/third_party/toolbench_static/toolbench_static.py,sha256=xE__eXvSwHmmSh1tXNvyBo6MCO4mDlYTbIYl9OGEfNI,2120
|
|
417
437
|
evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
418
438
|
evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=GITEbyiER10Zi-ZWpSqYCdAsiVtNeGK24hvR3kmYn2s,2689
|
|
419
439
|
evalscope/utils/__init__.py,sha256=jLVoGryuqUh4Km9QWWQBzpqkcVNRK0MbwNaSgckqdiU,139
|
|
420
440
|
evalscope/utils/arena_utils.py,sha256=Gf8VpH4C_oF2Abif_QeL0rAP6tvTzsc0gglpdNkUE48,7155
|
|
421
441
|
evalscope/utils/chat_service.py,sha256=U2jtrkOa2asRp16Zam0zIi_38mCyWQqql_L6JSwii4I,8749
|
|
422
442
|
evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
|
|
443
|
+
evalscope/utils/deprecation_utils.py,sha256=WyeiLWSi5ti6FkuMbhimcPPUB43paa1FZ5-JOAWNFZI,1344
|
|
423
444
|
evalscope/utils/filters.py,sha256=x_NX40uWMmUsVrAGHCeeV2e63HZZFugWUgdUhk64ivM,1523
|
|
424
445
|
evalscope/utils/import_utils.py,sha256=Oo8saX_mMw4U1RrA7_pn8FmV6P9laru4fEgecqqwpqk,2585
|
|
425
446
|
evalscope/utils/io_utils.py,sha256=Tjdgen1FsAA4ArqiUzu734L0Px5NuiS0GKRRiGIzxSA,4192
|
|
426
|
-
evalscope/utils/logger.py,sha256=
|
|
447
|
+
evalscope/utils/logger.py,sha256=Q2IeV_0jxz8L34b5GddPeCKXVh0UClbuhjyLe5Wtj7M,3648
|
|
427
448
|
evalscope/utils/model_utils.py,sha256=hB9W334ecAb6553FhooT6_jM0g-tjj6AU48IV3K1CKw,1131
|
|
428
|
-
evalscope/utils/utils.py,sha256=
|
|
449
|
+
evalscope/utils/utils.py,sha256=P5gmpINv5UQrwEMrFZKZjdJspsOdGjaBARfRSDVNOd0,11414
|
|
429
450
|
tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
430
451
|
tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
|
|
431
452
|
tests/aigc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
432
|
-
tests/aigc/test_t2i.py,sha256=
|
|
453
|
+
tests/aigc/test_t2i.py,sha256=YjEAwlM8cBfGCGOguz86UebJjJ5bsc3jhs4SQqyxwZs,3844
|
|
433
454
|
tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
434
|
-
tests/cli/test_all.py,sha256=
|
|
435
|
-
tests/cli/test_collection.py,sha256=
|
|
436
|
-
tests/cli/test_run.py,sha256=
|
|
455
|
+
tests/cli/test_all.py,sha256=noGE54iWnmoPGTsN2PGh7_jM5ceehN6bMnp6xxq4s3A,4240
|
|
456
|
+
tests/cli/test_collection.py,sha256=H7enYWGTmp2VRio-WTEfPRdkf3y-T4fs43Kqf81mbrQ,4181
|
|
457
|
+
tests/cli/test_run.py,sha256=OER_I6FeJAMUA2IN0zKUdUIeRDr8mJFaOiEpwQjYbnE,18166
|
|
437
458
|
tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
438
|
-
tests/perf/test_perf.py,sha256=
|
|
459
|
+
tests/perf/test_perf.py,sha256=VbXsqiqgQY3R3bVKizYQmP04UPluUS26MO6YhTzMs48,4848
|
|
439
460
|
tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
440
461
|
tests/rag/test_clip_benchmark.py,sha256=ZCBtgnF8Vuji6WQlb92-_RIvXlUX_Xt-cHZP4AN_DNI,2552
|
|
441
|
-
tests/rag/test_mteb.py,sha256=
|
|
462
|
+
tests/rag/test_mteb.py,sha256=PaWS5GrZdMO680M129QP2EG000rVq7f2iP3n0YDAv-w,5611
|
|
442
463
|
tests/rag/test_ragas.py,sha256=E7rfKpKtBqglOL1GcW9adfY8nsOZMuoB8GC55UL1Q3c,4517
|
|
443
464
|
tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
444
465
|
tests/swift/test_run_swift_eval.py,sha256=JKG-0BwTxkbg-XeiXxujPqnVIM3f2EFaJ_9a7p_R4dk,5748
|
|
@@ -446,9 +467,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
|
|
|
446
467
|
tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
|
|
447
468
|
tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
448
469
|
tests/vlm/test_vlmeval.py,sha256=UqRiBPMU3vRtLIG1Qu4ZVhyUQx-zGYQuLCgobwf-7a4,3176
|
|
449
|
-
evalscope-0.
|
|
450
|
-
evalscope-0.
|
|
451
|
-
evalscope-0.
|
|
452
|
-
evalscope-0.
|
|
453
|
-
evalscope-0.
|
|
454
|
-
evalscope-0.
|
|
470
|
+
evalscope-0.16.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
|
|
471
|
+
evalscope-0.16.1.dist-info/METADATA,sha256=H8eaMzt6o5k2wFIKnwBdTCPXnAexGvM-0PQqc16iKI4,36244
|
|
472
|
+
evalscope-0.16.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
473
|
+
evalscope-0.16.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
|
|
474
|
+
evalscope-0.16.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
|
|
475
|
+
evalscope-0.16.1.dist-info/RECORD,,
|
tests/aigc/test_t2i.py
CHANGED
|
@@ -11,7 +11,7 @@ from evalscope.run import run_task
|
|
|
11
11
|
from evalscope.utils import test_level_list
|
|
12
12
|
from evalscope.utils.logger import get_logger
|
|
13
13
|
|
|
14
|
-
os.environ['
|
|
14
|
+
os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
|
|
15
15
|
|
|
16
16
|
logger = get_logger()
|
|
17
17
|
|
|
@@ -58,9 +58,9 @@ class TestRun(unittest.TestCase):
|
|
|
58
58
|
'torch_dtype': 'torch.float16',
|
|
59
59
|
},
|
|
60
60
|
datasets=[
|
|
61
|
-
'tifa160',
|
|
61
|
+
# 'tifa160',
|
|
62
62
|
# 'genai_bench',
|
|
63
|
-
|
|
63
|
+
'evalmuse',
|
|
64
64
|
# 'hpdv2',
|
|
65
65
|
],
|
|
66
66
|
dataset_args={
|
|
@@ -85,3 +85,40 @@ class TestRun(unittest.TestCase):
|
|
|
85
85
|
)
|
|
86
86
|
|
|
87
87
|
run_task(task_cfg=task_cfg)
|
|
88
|
+
|
|
89
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
90
|
+
def test_run_benchmark_flux(self):
|
|
91
|
+
|
|
92
|
+
task_cfg = TaskConfig(
|
|
93
|
+
model='black-forest-labs/FLUX.1-dev', # model on modelscope
|
|
94
|
+
model_task=ModelTask.IMAGE_GENERATION, # must be IMAGE_GENERATION
|
|
95
|
+
model_args={
|
|
96
|
+
'torch_dtype': 'torch.float16',
|
|
97
|
+
},
|
|
98
|
+
datasets=[
|
|
99
|
+
# 'tifa160',
|
|
100
|
+
# 'genai_bench',
|
|
101
|
+
'evalmuse',
|
|
102
|
+
# 'hpdv2',
|
|
103
|
+
],
|
|
104
|
+
dataset_args={
|
|
105
|
+
'tifa160': {
|
|
106
|
+
'metric_list': [
|
|
107
|
+
'PickScore',
|
|
108
|
+
# 'CLIPScore',
|
|
109
|
+
# 'HPSv2Score',
|
|
110
|
+
# 'BLIPv2Score',
|
|
111
|
+
# 'ImageRewardScore',
|
|
112
|
+
# 'VQAScore',
|
|
113
|
+
# 'FGA_BLIP2Score',
|
|
114
|
+
]
|
|
115
|
+
}
|
|
116
|
+
},
|
|
117
|
+
generation_config={
|
|
118
|
+
'num_inference_steps': 50,
|
|
119
|
+
'guidance_scale': 3.5
|
|
120
|
+
},
|
|
121
|
+
use_cache='outputs/20250520_112314'
|
|
122
|
+
)
|
|
123
|
+
|
|
124
|
+
run_task(task_cfg=task_cfg)
|