evalscope 0.16.0__py3-none-any.whl → 0.16.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/app/__init__.py +28 -0
- evalscope/{report → app}/app.py +20 -25
- evalscope/app/constants.py +21 -0
- evalscope/arguments.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/utils/embedding.py +75 -35
- evalscope/benchmarks/benchmark.py +1 -0
- evalscope/benchmarks/data_adapter.py +97 -16
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +84 -0
- evalscope/benchmarks/docmath/utils.py +220 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +90 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +341 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +4 -1
- evalscope/benchmarks/tool_bench/utils.py +5 -4
- evalscope/benchmarks/utils.py +25 -0
- evalscope/cli/start_app.py +2 -2
- evalscope/collections/__init__.py +35 -3
- evalscope/collections/evaluator.py +18 -6
- evalscope/config.py +8 -2
- evalscope/evaluator/evaluator.py +38 -27
- evalscope/metrics/__init__.py +3 -1
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/llm_judge.py +12 -5
- evalscope/metrics/math_parser.py +1 -1
- evalscope/models/adapters/server_adapter.py +2 -6
- evalscope/perf/arguments.py +2 -2
- evalscope/perf/benchmark.py +0 -9
- evalscope/perf/main.py +7 -0
- evalscope/perf/plugin/datasets/custom.py +15 -0
- evalscope/perf/utils/benchmark_util.py +1 -1
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/perf/utils/log_utils.py +12 -5
- evalscope/perf/utils/rich_display.py +1 -1
- evalscope/report/__init__.py +36 -4
- evalscope/report/combinator.py +8 -0
- evalscope/report/generator.py +33 -9
- evalscope/report/utils.py +60 -3
- evalscope/run.py +12 -0
- evalscope/utils/logger.py +1 -1
- evalscope/utils/utils.py +12 -0
- evalscope/version.py +2 -2
- {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/METADATA +13 -11
- {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/RECORD +61 -50
- tests/aigc/test_t2i.py +40 -3
- tests/cli/test_all.py +39 -35
- tests/cli/test_collection.py +7 -6
- tests/cli/test_run.py +21 -11
- tests/rag/test_mteb.py +5 -5
- evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/LICENSE +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/WHEEL +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/top_level.txt +0 -0
evalscope/utils/logger.py
CHANGED
|
@@ -10,7 +10,7 @@ simple_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
|
|
10
10
|
|
|
11
11
|
detailed_formatter = logging.Formatter(detailed_format)
|
|
12
12
|
simple_formatter = logging.Formatter(simple_format)
|
|
13
|
-
DEFAULT_LEVEL = logging.DEBUG if os.getenv('
|
|
13
|
+
DEFAULT_LEVEL = logging.DEBUG if os.getenv('EVALSCOPE_LOG_LEVEL', 'INFO') == 'DEBUG' else logging.INFO
|
|
14
14
|
|
|
15
15
|
logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL, force=True)
|
|
16
16
|
|
evalscope/utils/utils.py
CHANGED
|
@@ -10,6 +10,7 @@ import os
|
|
|
10
10
|
import random
|
|
11
11
|
import re
|
|
12
12
|
import torch
|
|
13
|
+
from inspect import signature
|
|
13
14
|
from typing import Any, Dict, List, Tuple, Union
|
|
14
15
|
|
|
15
16
|
from evalscope.utils.logger import get_logger
|
|
@@ -313,6 +314,17 @@ def seed_everything(seed: int):
|
|
|
313
314
|
torch.backends.cudnn.deterministic = True
|
|
314
315
|
torch.backends.cudnn.benchmark = False
|
|
315
316
|
|
|
317
|
+
def get_supported_params(func):
|
|
318
|
+
"""Get the supported parameters of a function."""
|
|
319
|
+
sig = signature(func)
|
|
320
|
+
return list(sig.parameters.keys())
|
|
321
|
+
|
|
322
|
+
def parse_int_or_float(num):
|
|
323
|
+
number = float(num)
|
|
324
|
+
if number.is_integer():
|
|
325
|
+
return int(number)
|
|
326
|
+
return number
|
|
327
|
+
|
|
316
328
|
if __name__ == '__main__':
|
|
317
329
|
options = ['A', 'B', 'C', 'D']
|
|
318
330
|
answers = ['Context .... ANSWER: A', 'answer: A']
|
evalscope/version.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: evalscope
|
|
3
|
-
Version: 0.16.0
|
|
3
|
+
Version: 0.16.1
|
|
4
4
|
Summary: EvalScope: Lightweight LLMs Evaluation Framework
|
|
5
5
|
Home-page: https://github.com/modelscope/evalscope
|
|
6
6
|
Author: ModelScope team
|
|
@@ -17,12 +17,12 @@ Requires-Python: >=3.8
|
|
|
17
17
|
Description-Content-Type: text/markdown
|
|
18
18
|
License-File: LICENSE
|
|
19
19
|
Requires-Dist: accelerate
|
|
20
|
-
Requires-Dist: datasets
|
|
20
|
+
Requires-Dist: datasets>=3.0
|
|
21
21
|
Requires-Dist: immutabledict
|
|
22
22
|
Requires-Dist: jieba
|
|
23
23
|
Requires-Dist: jsonlines
|
|
24
24
|
Requires-Dist: langdetect
|
|
25
|
-
Requires-Dist: latex2sympy2
|
|
25
|
+
Requires-Dist: latex2sympy2-extended
|
|
26
26
|
Requires-Dist: matplotlib
|
|
27
27
|
Requires-Dist: modelscope[framework]
|
|
28
28
|
Requires-Dist: nltk>=3.9
|
|
@@ -52,12 +52,12 @@ Requires-Dist: open-clip-torch; extra == "aigc"
|
|
|
52
52
|
Requires-Dist: opencv-python; extra == "aigc"
|
|
53
53
|
Provides-Extra: all
|
|
54
54
|
Requires-Dist: accelerate; extra == "all"
|
|
55
|
-
Requires-Dist: datasets
|
|
55
|
+
Requires-Dist: datasets>=3.0; extra == "all"
|
|
56
56
|
Requires-Dist: immutabledict; extra == "all"
|
|
57
57
|
Requires-Dist: jieba; extra == "all"
|
|
58
58
|
Requires-Dist: jsonlines; extra == "all"
|
|
59
59
|
Requires-Dist: langdetect; extra == "all"
|
|
60
|
-
Requires-Dist: latex2sympy2; extra == "all"
|
|
60
|
+
Requires-Dist: latex2sympy2-extended; extra == "all"
|
|
61
61
|
Requires-Dist: matplotlib; extra == "all"
|
|
62
62
|
Requires-Dist: modelscope[framework]; extra == "all"
|
|
63
63
|
Requires-Dist: nltk>=3.9; extra == "all"
|
|
@@ -79,13 +79,13 @@ Requires-Dist: torchvision; extra == "all"
|
|
|
79
79
|
Requires-Dist: tqdm; extra == "all"
|
|
80
80
|
Requires-Dist: transformers>=4.33; extra == "all"
|
|
81
81
|
Requires-Dist: word2number; extra == "all"
|
|
82
|
-
Requires-Dist: ms-opencompass>=0.1.
|
|
83
|
-
Requires-Dist: ms-vlmeval>=0.0.
|
|
82
|
+
Requires-Dist: ms-opencompass>=0.1.6; extra == "all"
|
|
83
|
+
Requires-Dist: ms-vlmeval>=0.0.17; extra == "all"
|
|
84
84
|
Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "all"
|
|
85
85
|
Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "all"
|
|
86
86
|
Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "all"
|
|
87
87
|
Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "all"
|
|
88
|
-
Requires-Dist: mteb==1.
|
|
88
|
+
Requires-Dist: mteb==1.38.20; extra == "all"
|
|
89
89
|
Requires-Dist: ragas==0.2.14; extra == "all"
|
|
90
90
|
Requires-Dist: webdataset>0.2.0; extra == "all"
|
|
91
91
|
Requires-Dist: aiohttp; extra == "all"
|
|
@@ -106,7 +106,7 @@ Provides-Extra: app
|
|
|
106
106
|
Requires-Dist: gradio==5.4.0; extra == "app"
|
|
107
107
|
Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
|
|
108
108
|
Provides-Extra: opencompass
|
|
109
|
-
Requires-Dist: ms-opencompass>=0.1.
|
|
109
|
+
Requires-Dist: ms-opencompass>=0.1.6; extra == "opencompass"
|
|
110
110
|
Provides-Extra: perf
|
|
111
111
|
Requires-Dist: aiohttp; extra == "perf"
|
|
112
112
|
Requires-Dist: fastapi; extra == "perf"
|
|
@@ -120,11 +120,11 @@ Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "rag"
|
|
|
120
120
|
Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "rag"
|
|
121
121
|
Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "rag"
|
|
122
122
|
Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "rag"
|
|
123
|
-
Requires-Dist: mteb==1.
|
|
123
|
+
Requires-Dist: mteb==1.38.20; extra == "rag"
|
|
124
124
|
Requires-Dist: ragas==0.2.14; extra == "rag"
|
|
125
125
|
Requires-Dist: webdataset>0.2.0; extra == "rag"
|
|
126
126
|
Provides-Extra: vlmeval
|
|
127
|
-
Requires-Dist: ms-vlmeval>=0.0.
|
|
127
|
+
Requires-Dist: ms-vlmeval>=0.0.17; extra == "vlmeval"
|
|
128
128
|
|
|
129
129
|
<p align="center">
|
|
130
130
|
<br>
|
|
@@ -230,6 +230,8 @@ Please scan the QR code below to join our community groups:
|
|
|
230
230
|
|
|
231
231
|
## 🎉 News
|
|
232
232
|
|
|
233
|
+
- 🔥 **[2025.06.02]** Added support for the Needle-in-a-Haystack test. Simply specify `needle_haystack` to conduct the test, and a corresponding heatmap will be generated in the `outputs/reports` folder, providing a visual representation of the model's performance. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html) for more details.
|
|
234
|
+
- 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
|
|
233
235
|
- 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
|
|
234
236
|
- 🔥 **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate model's tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
|
|
235
237
|
- 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
|
|
@@ -1,16 +1,20 @@
|
|
|
1
1
|
evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
|
|
2
|
-
evalscope/arguments.py,sha256=
|
|
3
|
-
evalscope/config.py,sha256=
|
|
2
|
+
evalscope/arguments.py,sha256=QkxE8eGSryiyo9uDiNQNZUI3l_hGPYmhVz1-KHgtB6E,6044
|
|
3
|
+
evalscope/config.py,sha256=HGvIlhjVjA9QtAiNEUrx_hev3wa-RaNEXelEiLJn9OM,11015
|
|
4
4
|
evalscope/constants.py,sha256=PHnsGndB4N5-jvmawPxMK5b9geE2Es5cUe8ZKYSuKgM,4016
|
|
5
|
-
evalscope/run.py,sha256=
|
|
5
|
+
evalscope/run.py,sha256=saHZGlwbBLYtFk4BmKkjQEOOHQQ-pDKzN21taao6Os0,6957
|
|
6
6
|
evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
|
|
7
7
|
evalscope/summarizer.py,sha256=61kU5ZoSh1dd8HMJPqP3ZvJwcY9szwWFCZdu2lfATJA,5920
|
|
8
|
-
evalscope/version.py,sha256=
|
|
8
|
+
evalscope/version.py,sha256=vMuGTezikPNdTLYlejHdHznB5WhuHCnAhaOdw3iqU5E,119
|
|
9
|
+
evalscope/app/__init__.py,sha256=HWLXld_JXcBDsdL4L_4E8JsKyuBwwPUSwlejKnZ3HKc,579
|
|
10
|
+
evalscope/app/app.py,sha256=sTYoc3Uag7DqYbb_qXo8QJX4oer8dueQK1wdgaLlTiY,29371
|
|
11
|
+
evalscope/app/arguments.py,sha256=1wHTLeFx1G94cKXYOeOVe_wTiOY2D929UctIRGOtRaQ,699
|
|
12
|
+
evalscope/app/constants.py,sha256=KpItEl9lF0VldOm0grjS7RVbbseemtsXZJKtgGmAQB8,361
|
|
9
13
|
evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
14
|
evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
|
|
11
15
|
evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
|
|
12
16
|
evalscope/backend/opencompass/api_meta_template.py,sha256=DaBJg15ZSIjxroXiygl3-4RdmIe_FD7xHbXvjSZmkQA,1706
|
|
13
|
-
evalscope/backend/opencompass/backend_manager.py,sha256=
|
|
17
|
+
evalscope/backend/opencompass/backend_manager.py,sha256=kIPzirjAOW0_YNQiCrhjRfAVD3UpcGmr4RXBH-WMH0Y,10409
|
|
14
18
|
evalscope/backend/opencompass/tasks/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
15
19
|
evalscope/backend/opencompass/tasks/eval_api.py,sha256=ZaGdUbEOtAW5VX3ZXmpHIttg_QrID34EnBTylD3uvos,1152
|
|
16
20
|
evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=JHSq4EnPJgv4sRJJplLH80EqE3ghtkn2k8HnV6DaDew,5406
|
|
@@ -27,12 +31,12 @@ evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py,sha256=t0U
|
|
|
27
31
|
evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py,sha256=rZY-TulG-Cb8b6GTBxqTDYQ_4Ois3kbgKhuunZq8Ato,8407
|
|
28
32
|
evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt,sha256=eiiAaxhS48b5rVLy5O9VvFfV2AfxY86ITu_iqT7ZLkQ,649
|
|
29
33
|
evalscope/backend/rag_eval/cmteb/__init__.py,sha256=I502GHPFYo8BwlFvoljGKI24PY76eBXJQiquWk8nJNU,280
|
|
30
|
-
evalscope/backend/rag_eval/cmteb/arguments.py,sha256=
|
|
34
|
+
evalscope/backend/rag_eval/cmteb/arguments.py,sha256=xROhoVxJvMhhU9S5SKtiavQHM447esbrVWlbmes4AVI,2814
|
|
31
35
|
evalscope/backend/rag_eval/cmteb/base.py,sha256=UCobQ81dHkiTmIz_0BJ_VANj_uG6mkJbYLKJztvMXfo,2849
|
|
32
|
-
evalscope/backend/rag_eval/cmteb/task_template.py,sha256=
|
|
36
|
+
evalscope/backend/rag_eval/cmteb/task_template.py,sha256=vPfbBvtVjX6U6QHEG5mRP9CQjFMF-_8EdrpYoNHbDFU,3303
|
|
33
37
|
evalscope/backend/rag_eval/cmteb/tasks/Classification.py,sha256=sqbH0XmSiIm4n5UX5sXMwJHby1r-d35mwW1tKIhb2Hg,10848
|
|
34
38
|
evalscope/backend/rag_eval/cmteb/tasks/Clustering.py,sha256=-GTwORxILSkkXXGtTxuPTKSHNXQEllCRoUjuR7pnwFM,8962
|
|
35
|
-
evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py,sha256=
|
|
39
|
+
evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py,sha256=_uuDPaerh6qbxw7W3DiPrWuxfEyLeKCHeduYcp-1Veg,2025
|
|
36
40
|
evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py,sha256=yISp67pXw4fSrsqTiYmfas6uPyqwE45L1c58Tpydc0E,4075
|
|
37
41
|
evalscope/backend/rag_eval/cmteb/tasks/Reranking.py,sha256=AH7jwJ45WAVxVb60I2DTURVanIAbrlZzk-ey_dHWEO0,5491
|
|
38
42
|
evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py,sha256=ofmmeoieXHmU6O14JKWO9GUpuEEmcWwc78Q7ZJjRDZs,11454
|
|
@@ -49,15 +53,15 @@ evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=YSqpaXMFVe8m
|
|
|
49
53
|
evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=6x-4O2pgsjZCVfJNvwZEKcgLe_QhSknPg-f2jGjZkU4,1890
|
|
50
54
|
evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
51
55
|
evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
|
|
52
|
-
evalscope/backend/rag_eval/utils/embedding.py,sha256=
|
|
56
|
+
evalscope/backend/rag_eval/utils/embedding.py,sha256=3CkLX6SXGAc6ltUQe4V_IcTr71cZSane5-VjaRYn13M,9466
|
|
53
57
|
evalscope/backend/rag_eval/utils/llm.py,sha256=NHjm0SeQVsSIG8uISXZcQypku4QRc3KtteeO9ldv0FI,2611
|
|
54
58
|
evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
|
|
55
59
|
evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
|
|
56
60
|
evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=sUYvQxCtPl6CrcwhQpY8lJjW5skqWc-fvHUSnXd_MvQ,6054
|
|
57
61
|
evalscope/benchmarks/__init__.py,sha256=5AXNhhmbaBFEe3u7y5TtIrviYzFI-hC8oKqxFILs1pE,937
|
|
58
|
-
evalscope/benchmarks/benchmark.py,sha256=
|
|
59
|
-
evalscope/benchmarks/data_adapter.py,sha256=
|
|
60
|
-
evalscope/benchmarks/utils.py,sha256=
|
|
62
|
+
evalscope/benchmarks/benchmark.py,sha256=X-vBzz5PDVI5rBbqWpiUZq0bmGhp9cRZiA27XCgxPdE,2573
|
|
63
|
+
evalscope/benchmarks/data_adapter.py,sha256=Z2s4mfJssxNAeFPVNgZLkBbc3DBbJRZNGbRBigLe4I4,22893
|
|
64
|
+
evalscope/benchmarks/utils.py,sha256=81MwUJYWjJgoiRClY-IFB-EZN0th-oQDTvU2ekaEmpc,1869
|
|
61
65
|
evalscope/benchmarks/aigc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
62
66
|
evalscope/benchmarks/aigc/t2i/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
63
67
|
evalscope/benchmarks/aigc/t2i/base.py,sha256=4GFAvceT1Gpt5teDLRCZi62RwvPazuhG3zwft3gN3X4,2102
|
|
@@ -120,9 +124,15 @@ evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc
|
|
|
120
124
|
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=wgejW-_QswtT8_3JKAQ_H6svH8IotDJDBEH7X4nP4bY,6760
|
|
121
125
|
evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
122
126
|
evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=QgLgIrjD3q53T-lu1UWTV6T4h1cKGoCQDh0O4QxFezw,2569
|
|
127
|
+
evalscope/benchmarks/docmath/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
128
|
+
evalscope/benchmarks/docmath/docmath_adapter.py,sha256=GAoHuFASKyWCVbB0nmImsEB-YCREwB75WjdqYB0CcyU,2912
|
|
129
|
+
evalscope/benchmarks/docmath/utils.py,sha256=ptd-Sot4QtUmUG4dMlqXtUWHKZplo5jSTolsypqX9Ho,7716
|
|
123
130
|
evalscope/benchmarks/drop/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
124
131
|
evalscope/benchmarks/drop/drop_adapter.py,sha256=V-Vx6g2_1kcDUDWOKVX1vPSLt5iHn8NQkpWbsIwPaa4,8325
|
|
125
132
|
evalscope/benchmarks/drop/utils.py,sha256=Z9PHrNnRfGqFHCLONg5SWKARp1eTJlHFc_bU46t_YrM,1344
|
|
133
|
+
evalscope/benchmarks/frames/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
134
|
+
evalscope/benchmarks/frames/frames_adapter.py,sha256=wbug6yDlq6N5SfCQaOn43K8klJjrZc9iigFEPQs5nKA,3096
|
|
135
|
+
evalscope/benchmarks/frames/utils.py,sha256=gULWM6Rwv5bTSSWcDYp-iSIoWj8r5VtbQakhRzHJq8A,1172
|
|
126
136
|
evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
127
137
|
evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=fqbt61owPP7t2H4B2zbYVZTs0VBGuXNvWGvkukwhRYc,5039
|
|
128
138
|
evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
@@ -169,6 +179,9 @@ evalscope/benchmarks/mmlu_redux/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
|
|
|
169
179
|
evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=Kr30i_exxBJRz9PLB5g6F04e2HJ4WuF6LDyAwaRh2MY,9578
|
|
170
180
|
evalscope/benchmarks/musr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
171
181
|
evalscope/benchmarks/musr/musr_adapter.py,sha256=85P0sY7H9pthYdCjkE2AOxaiNhcIBW1iZmODkz3FN0M,2464
|
|
182
|
+
evalscope/benchmarks/needle_haystack/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
183
|
+
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py,sha256=rNi7ULskhhHh1eVN1eV15gyLVFE05uertlZlCzMzgOE,15355
|
|
184
|
+
evalscope/benchmarks/needle_haystack/utils.py,sha256=bDwtpMS7Eqr63urCttS9i3BvT_aPuNvrQU-vEc6tcx0,2911
|
|
172
185
|
evalscope/benchmarks/process_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
173
186
|
evalscope/benchmarks/process_bench/critique_template.txt,sha256=tycx8n42QEC0uGcwbIvHfZvfTnchlRxGz8Tp1R2_e_Y,489
|
|
174
187
|
evalscope/benchmarks/process_bench/process_bench_adapter.py,sha256=ydU-r1T0DaYhOxkhZgGL7PhDd4XoeqOBzVO9oiFPd8M,3422
|
|
@@ -184,8 +197,8 @@ evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=BqNLL8BYnK6tRuIdV6i
|
|
|
184
197
|
evalscope/benchmarks/super_gpqa/utils.py,sha256=ftYPP9ODvLBlQSd9ltACx9iRIvjB8u1bg4AtgcJ4JAI,3360
|
|
185
198
|
evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt,sha256=y7hR9SmoR_YqoEWtT8N9JpZOpeJIlg0cDGDgYw6R6hM,237
|
|
186
199
|
evalscope/benchmarks/tool_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
187
|
-
evalscope/benchmarks/tool_bench/tool_bench_adapter.py,sha256=
|
|
188
|
-
evalscope/benchmarks/tool_bench/utils.py,sha256=
|
|
200
|
+
evalscope/benchmarks/tool_bench/tool_bench_adapter.py,sha256=c8_Cok_wctlBtWd7kDQY9McaFbkWsW9LTC5JzPpef-Q,2399
|
|
201
|
+
evalscope/benchmarks/tool_bench/utils.py,sha256=led0d-Pa3rvmWkSWhEnZWP00fceudgESq5HXAQzJGls,7042
|
|
189
202
|
evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
190
203
|
evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
|
|
191
204
|
evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
|
|
@@ -198,27 +211,27 @@ evalscope/benchmarks/winogrande/winogrande_adapter.py,sha256=UdANz3YmCtV2YfGuEih
|
|
|
198
211
|
evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
199
212
|
evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
|
|
200
213
|
evalscope/cli/cli.py,sha256=w_dtXljur9s5lmTn6LbbFL_viTPQB1WAEzhYcId09Og,729
|
|
201
|
-
evalscope/cli/start_app.py,sha256=
|
|
214
|
+
evalscope/cli/start_app.py,sha256=dV63nvBYEUl2sGeVxoUH4IJBXJSLecaq293i3alBWxo,794
|
|
202
215
|
evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,775
|
|
203
216
|
evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
|
|
204
217
|
evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
|
|
205
|
-
evalscope/collections/__init__.py,sha256=
|
|
206
|
-
evalscope/collections/evaluator.py,sha256=
|
|
218
|
+
evalscope/collections/__init__.py,sha256=3v7tVLcJk86FeNBrxw3pWhu_lcpKYrnT_dDACCeR2Io,853
|
|
219
|
+
evalscope/collections/evaluator.py,sha256=NnLel9lOyR0wzOwxDGSCFWJN4zFx9ZA2hc0PI-FSvl0,16200
|
|
207
220
|
evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
|
|
208
221
|
evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
|
|
209
222
|
evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
|
|
210
|
-
evalscope/evaluator/evaluator.py,sha256=
|
|
223
|
+
evalscope/evaluator/evaluator.py,sha256=d8cFq08oJ6kbKcwr4mVh517OxndgyqUrmuEP-bwmR6g,22071
|
|
211
224
|
evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
|
|
212
225
|
evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
213
226
|
evalscope/evaluator/reviewer/auto_reviewer.py,sha256=5WRYuXFTDgVmolrOdiTysk-mXrpw6Qg87-iuY-VD1W4,16618
|
|
214
|
-
evalscope/metrics/__init__.py,sha256=
|
|
215
|
-
evalscope/metrics/llm_judge.py,sha256=
|
|
216
|
-
evalscope/metrics/math_parser.py,sha256=
|
|
227
|
+
evalscope/metrics/__init__.py,sha256=g96dZSt3Dh56TdVbe4yDqcfmr9DoLqH-R2__3Qvorjk,1497
|
|
228
|
+
evalscope/metrics/llm_judge.py,sha256=O2IaJpsBe1HqfCVnRYOt_PLWg6w85DYlYLU7yTq5idw,4384
|
|
229
|
+
evalscope/metrics/math_parser.py,sha256=JtOkj28XOtwoUACXOXLzCeRYz0rx0tBsQLQDU8cbC20,17311
|
|
217
230
|
evalscope/metrics/metrics.py,sha256=_YI7RhxlFu_JOgeE3LF9UKu6mJruvyu4FgqVf78Bjb8,13813
|
|
218
231
|
evalscope/metrics/named_metrics.py,sha256=PrzU_1mGTeRFxVJFT1aXxIOiS7MnNoWyZsb8uCRVDeE,2278
|
|
219
232
|
evalscope/metrics/rouge_metric.py,sha256=bqvSotuDdC0MEKmt8v6y6tBTBx0S3Ma-tfF-cMCckA4,4645
|
|
220
233
|
evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
|
|
221
|
-
evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=
|
|
234
|
+
evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=T91PgJfi1As7BR7I-Hq6rLlvHAtMB9JpBw9gMTH8VlE,12114
|
|
222
235
|
evalscope/metrics/t2v_metrics/__init__.py,sha256=GBxgKTPVy_qhW_F3M4Oi6QMWhdAi4PqGX5w3t6Tueho,1783
|
|
223
236
|
evalscope/metrics/t2v_metrics/clipscore.py,sha256=IsrYKIlFb04-FfBq4MbSv4diS6706J15Y3G4qEFIwfU,455
|
|
224
237
|
evalscope/metrics/t2v_metrics/constants.py,sha256=oY5l5fOFl8qylah9eeebZm0pgY1PYmHDa7JlUC8Qls0,451
|
|
@@ -329,16 +342,16 @@ evalscope/models/adapters/base_adapter.py,sha256=f2FY8DLERudkfb4_anxNVFE_D19xCJj
|
|
|
329
342
|
evalscope/models/adapters/chat_adapter.py,sha256=PAClyBL_nQ1I1kmjeeZ3sdC-y5ZmfFj8rjCigh_vr40,7885
|
|
330
343
|
evalscope/models/adapters/choice_adapter.py,sha256=4fuz3MFEqK8ln4mMs3goMCdRPBwYmmgN70HTdr_sW_U,8005
|
|
331
344
|
evalscope/models/adapters/custom_adapter.py,sha256=w8cD0b3xgcdhSZelcat67CGJnALOfz5IALzURnLjab8,2275
|
|
332
|
-
evalscope/models/adapters/server_adapter.py,sha256=
|
|
345
|
+
evalscope/models/adapters/server_adapter.py,sha256=qdonCJLoM0qmFQtHziczUqVzA31p4AxIn2j9oNIosLw,6493
|
|
333
346
|
evalscope/models/adapters/t2i_adapter.py,sha256=xkMRyZ61yTiJfmULK-p9du4nNox41pkHiV2CTFBO3qM,2659
|
|
334
347
|
evalscope/models/custom/__init__.py,sha256=MZylegALg1HerOYtp-qbzu4Wb6PW3JbrxwONHU-PAVs,131
|
|
335
348
|
evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
|
|
336
349
|
evalscope/models/custom/dummy_model.py,sha256=WRT_aCBZLXnC4yRCgggkuySkhM71C47O2Txx_YNc3UM,1933
|
|
337
350
|
evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
338
|
-
evalscope/perf/arguments.py,sha256=
|
|
339
|
-
evalscope/perf/benchmark.py,sha256=
|
|
351
|
+
evalscope/perf/arguments.py,sha256=HUKzcU-FBt34DgGJ0nc5rNgJAMpZwYQXMz8VU8jokco,10668
|
|
352
|
+
evalscope/perf/benchmark.py,sha256=qEgIX_Z4x3FNtAKTMlP2mRJTerRV5seCbVtB4XklnQI,7566
|
|
340
353
|
evalscope/perf/http_client.py,sha256=-c3-N7bxKsj3d5DVsKSaYA3XAHJDzZgoqZBbhuDYIGk,7419
|
|
341
|
-
evalscope/perf/main.py,sha256=
|
|
354
|
+
evalscope/perf/main.py,sha256=yfJWGd2l4uU_qKW9bD6DzV0DK9XXuCJGLYjF_JWR22E,3394
|
|
342
355
|
evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
|
|
343
356
|
evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
|
|
344
357
|
evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
|
|
@@ -348,7 +361,7 @@ evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4I
|
|
|
348
361
|
evalscope/perf/plugin/api/openai_api.py,sha256=kTL_2OACuKhzd2W0Pf4DirpMumzk4V3rqKZ2mvBZVCs,7655
|
|
349
362
|
evalscope/perf/plugin/datasets/__init__.py,sha256=Z6Jc0RxJS_z0nBBV1-b0-56Ija60AtQ7I_67gY6ZfdQ,568
|
|
350
363
|
evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
|
|
351
|
-
evalscope/perf/plugin/datasets/custom.py,sha256
|
|
364
|
+
evalscope/perf/plugin/datasets/custom.py,sha256=-meul2hRmYvYAo--c_EtCnItRi5DvN7xxFOpq6vqdts,1346
|
|
352
365
|
evalscope/perf/plugin/datasets/flickr8k.py,sha256=MbJKEB0XqZE0nDEenwYs0FLH9QL658Vn9uQmUH4hPvk,1605
|
|
353
366
|
evalscope/perf/plugin/datasets/line_by_line.py,sha256=AqZYG6tVL3BIGnzh_2Tev8lDYezJG_1gqJY8bSNQl3Q,957
|
|
354
367
|
evalscope/perf/plugin/datasets/longalpaca.py,sha256=XelLris0-c3StLInQ-Oav4jqGcXPNfJxEDeYvaetEbI,1297
|
|
@@ -357,12 +370,12 @@ evalscope/perf/plugin/datasets/random_dataset.py,sha256=SIlsjAE_Stknfr6o1CBFvANB
|
|
|
357
370
|
evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=J6q7AF_Re5eHLVejXEw9c1jlk1T1PPmist0yO9UFTPE,2432
|
|
358
371
|
evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
359
372
|
evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
|
|
360
|
-
evalscope/perf/utils/benchmark_util.py,sha256=
|
|
373
|
+
evalscope/perf/utils/benchmark_util.py,sha256=EPKUDijue85b8KhSJoJKLh6comkTKRjq2yoEw4kxBho,7227
|
|
361
374
|
evalscope/perf/utils/db_util.py,sha256=xqrXZapP_WwUdzkgFBTh3LDBWzr_UoU8v13rOjQ8TT4,9876
|
|
362
375
|
evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
|
|
363
|
-
evalscope/perf/utils/local_server.py,sha256=
|
|
364
|
-
evalscope/perf/utils/log_utils.py,sha256=
|
|
365
|
-
evalscope/perf/utils/rich_display.py,sha256=
|
|
376
|
+
evalscope/perf/utils/local_server.py,sha256=RL9rGd5tEniZ0aErhHcbVXMX22YmujfE11T3j37VL8k,4684
|
|
377
|
+
evalscope/perf/utils/log_utils.py,sha256=NWSK_ITG4yoVx5GMLbIRGDoXSs90s7X3mftdm37Os2U,1666
|
|
378
|
+
evalscope/perf/utils/rich_display.py,sha256=xZzeryQbYM6Cv8g1ulK6OQUE2CalQ_KtFxiy7pioeEU,8127
|
|
366
379
|
evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
367
380
|
evalscope/registry/config/cfg_arena.yaml,sha256=rub6ceaQxxB1mbSjdoFf0IaVgGfbOonV2nYRebv2OKo,3292
|
|
368
381
|
evalscope/registry/config/cfg_arena_zhihu.yaml,sha256=tvvihBwvoTjoezwTSaZwoGOB44ysofpnin4pNyY9TfQ,2755
|
|
@@ -384,12 +397,10 @@ evalscope/registry/tasks/general_qa.yaml,sha256=S3kdlrazWX2VAX2PMhNtBnFZVSnUKBNi
|
|
|
384
397
|
evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHMG0LXiM,729
|
|
385
398
|
evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
|
|
386
399
|
evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
|
|
387
|
-
evalscope/report/__init__.py,sha256=
|
|
388
|
-
evalscope/report/
|
|
389
|
-
evalscope/report/
|
|
390
|
-
evalscope/report/
|
|
391
|
-
evalscope/report/generator.py,sha256=q9aHWNjQgvutAKtpjfWOpfu5zNFdnXilO9OqBqt_Phg,3612
|
|
392
|
-
evalscope/report/utils.py,sha256=uu-rAzoN6ZIlv52IDWSZCcmNVY3DscNo2f9H9-gjZHY,4602
|
|
400
|
+
evalscope/report/__init__.py,sha256=mLCgT7G-WPagQHOGz97AOdLQJjyikrswDiXA8d9Wr_Q,923
|
|
401
|
+
evalscope/report/combinator.py,sha256=xGX0B6tGZxaEB20tziPQm3HUkvgftghKg5AEQ8JpsBE,2842
|
|
402
|
+
evalscope/report/generator.py,sha256=oykmQROG-Bt8ttCH4RtvmGJ39HmDJMTU6gG26lg5LHE,4321
|
|
403
|
+
evalscope/report/utils.py,sha256=KAc4Cq8NMxTUjCJHI5MK3ZqzBNjfDMXrwLBpUkaywjk,6520
|
|
393
404
|
evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
394
405
|
evalscope/third_party/longbench_write/README.md,sha256=1yLKeSVIcihpoc4KXr8NpK86JbcLssCPx76aOKdPbYI,5431
|
|
395
406
|
evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
|
|
@@ -433,22 +444,22 @@ evalscope/utils/deprecation_utils.py,sha256=WyeiLWSi5ti6FkuMbhimcPPUB43paa1FZ5-J
|
|
|
433
444
|
evalscope/utils/filters.py,sha256=x_NX40uWMmUsVrAGHCeeV2e63HZZFugWUgdUhk64ivM,1523
|
|
434
445
|
evalscope/utils/import_utils.py,sha256=Oo8saX_mMw4U1RrA7_pn8FmV6P9laru4fEgecqqwpqk,2585
|
|
435
446
|
evalscope/utils/io_utils.py,sha256=Tjdgen1FsAA4ArqiUzu734L0Px5NuiS0GKRRiGIzxSA,4192
|
|
436
|
-
evalscope/utils/logger.py,sha256=
|
|
447
|
+
evalscope/utils/logger.py,sha256=Q2IeV_0jxz8L34b5GddPeCKXVh0UClbuhjyLe5Wtj7M,3648
|
|
437
448
|
evalscope/utils/model_utils.py,sha256=hB9W334ecAb6553FhooT6_jM0g-tjj6AU48IV3K1CKw,1131
|
|
438
|
-
evalscope/utils/utils.py,sha256=
|
|
449
|
+
evalscope/utils/utils.py,sha256=P5gmpINv5UQrwEMrFZKZjdJspsOdGjaBARfRSDVNOd0,11414
|
|
439
450
|
tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
440
451
|
tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
|
|
441
452
|
tests/aigc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
442
|
-
tests/aigc/test_t2i.py,sha256=
|
|
453
|
+
tests/aigc/test_t2i.py,sha256=YjEAwlM8cBfGCGOguz86UebJjJ5bsc3jhs4SQqyxwZs,3844
|
|
443
454
|
tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
444
|
-
tests/cli/test_all.py,sha256=
|
|
445
|
-
tests/cli/test_collection.py,sha256=
|
|
446
|
-
tests/cli/test_run.py,sha256=
|
|
455
|
+
tests/cli/test_all.py,sha256=noGE54iWnmoPGTsN2PGh7_jM5ceehN6bMnp6xxq4s3A,4240
|
|
456
|
+
tests/cli/test_collection.py,sha256=H7enYWGTmp2VRio-WTEfPRdkf3y-T4fs43Kqf81mbrQ,4181
|
|
457
|
+
tests/cli/test_run.py,sha256=OER_I6FeJAMUA2IN0zKUdUIeRDr8mJFaOiEpwQjYbnE,18166
|
|
447
458
|
tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
448
459
|
tests/perf/test_perf.py,sha256=VbXsqiqgQY3R3bVKizYQmP04UPluUS26MO6YhTzMs48,4848
|
|
449
460
|
tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
450
461
|
tests/rag/test_clip_benchmark.py,sha256=ZCBtgnF8Vuji6WQlb92-_RIvXlUX_Xt-cHZP4AN_DNI,2552
|
|
451
|
-
tests/rag/test_mteb.py,sha256=
|
|
462
|
+
tests/rag/test_mteb.py,sha256=PaWS5GrZdMO680M129QP2EG000rVq7f2iP3n0YDAv-w,5611
|
|
452
463
|
tests/rag/test_ragas.py,sha256=E7rfKpKtBqglOL1GcW9adfY8nsOZMuoB8GC55UL1Q3c,4517
|
|
453
464
|
tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
454
465
|
tests/swift/test_run_swift_eval.py,sha256=JKG-0BwTxkbg-XeiXxujPqnVIM3f2EFaJ_9a7p_R4dk,5748
|
|
@@ -456,9 +467,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
|
|
|
456
467
|
tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
|
|
457
468
|
tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
458
469
|
tests/vlm/test_vlmeval.py,sha256=UqRiBPMU3vRtLIG1Qu4ZVhyUQx-zGYQuLCgobwf-7a4,3176
|
|
459
|
-
evalscope-0.16.
|
|
460
|
-
evalscope-0.16.
|
|
461
|
-
evalscope-0.16.
|
|
462
|
-
evalscope-0.16.
|
|
463
|
-
evalscope-0.16.
|
|
464
|
-
evalscope-0.16.
|
|
470
|
+
evalscope-0.16.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
|
|
471
|
+
evalscope-0.16.1.dist-info/METADATA,sha256=H8eaMzt6o5k2wFIKnwBdTCPXnAexGvM-0PQqc16iKI4,36244
|
|
472
|
+
evalscope-0.16.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
473
|
+
evalscope-0.16.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
|
|
474
|
+
evalscope-0.16.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
|
|
475
|
+
evalscope-0.16.1.dist-info/RECORD,,
|
tests/aigc/test_t2i.py
CHANGED
|
@@ -11,7 +11,7 @@ from evalscope.run import run_task
|
|
|
11
11
|
from evalscope.utils import test_level_list
|
|
12
12
|
from evalscope.utils.logger import get_logger
|
|
13
13
|
|
|
14
|
-
os.environ['
|
|
14
|
+
os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
|
|
15
15
|
|
|
16
16
|
logger = get_logger()
|
|
17
17
|
|
|
@@ -58,9 +58,9 @@ class TestRun(unittest.TestCase):
|
|
|
58
58
|
'torch_dtype': 'torch.float16',
|
|
59
59
|
},
|
|
60
60
|
datasets=[
|
|
61
|
-
'tifa160',
|
|
61
|
+
# 'tifa160',
|
|
62
62
|
# 'genai_bench',
|
|
63
|
-
|
|
63
|
+
'evalmuse',
|
|
64
64
|
# 'hpdv2',
|
|
65
65
|
],
|
|
66
66
|
dataset_args={
|
|
@@ -85,3 +85,40 @@ class TestRun(unittest.TestCase):
|
|
|
85
85
|
)
|
|
86
86
|
|
|
87
87
|
run_task(task_cfg=task_cfg)
|
|
88
|
+
|
|
89
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
90
|
+
def test_run_benchmark_flux(self):
|
|
91
|
+
|
|
92
|
+
task_cfg = TaskConfig(
|
|
93
|
+
model='black-forest-labs/FLUX.1-dev', # model on modelscope
|
|
94
|
+
model_task=ModelTask.IMAGE_GENERATION, # must be IMAGE_GENERATION
|
|
95
|
+
model_args={
|
|
96
|
+
'torch_dtype': 'torch.float16',
|
|
97
|
+
},
|
|
98
|
+
datasets=[
|
|
99
|
+
# 'tifa160',
|
|
100
|
+
# 'genai_bench',
|
|
101
|
+
'evalmuse',
|
|
102
|
+
# 'hpdv2',
|
|
103
|
+
],
|
|
104
|
+
dataset_args={
|
|
105
|
+
'tifa160': {
|
|
106
|
+
'metric_list': [
|
|
107
|
+
'PickScore',
|
|
108
|
+
# 'CLIPScore',
|
|
109
|
+
# 'HPSv2Score',
|
|
110
|
+
# 'BLIPv2Score',
|
|
111
|
+
# 'ImageRewardScore',
|
|
112
|
+
# 'VQAScore',
|
|
113
|
+
# 'FGA_BLIP2Score',
|
|
114
|
+
]
|
|
115
|
+
}
|
|
116
|
+
},
|
|
117
|
+
generation_config={
|
|
118
|
+
'num_inference_steps': 50,
|
|
119
|
+
'guidance_scale': 3.5
|
|
120
|
+
},
|
|
121
|
+
use_cache='outputs/20250520_112314'
|
|
122
|
+
)
|
|
123
|
+
|
|
124
|
+
run_task(task_cfg=task_cfg)
|
tests/cli/test_all.py
CHANGED
|
@@ -12,43 +12,46 @@ from evalscope.run import run_task
|
|
|
12
12
|
from evalscope.utils import test_level_list
|
|
13
13
|
from evalscope.utils.logger import get_logger
|
|
14
14
|
|
|
15
|
-
os.environ['
|
|
15
|
+
os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
|
|
16
16
|
|
|
17
17
|
logger = get_logger()
|
|
18
18
|
|
|
19
19
|
datasets=[
|
|
20
|
-
'iquiz',
|
|
21
|
-
'ifeval',
|
|
22
|
-
'mmlu',
|
|
23
|
-
'mmlu_pro',
|
|
24
|
-
'musr',
|
|
25
|
-
'process_bench',
|
|
26
|
-
'race',
|
|
27
|
-
'trivia_qa',
|
|
28
|
-
'cmmlu',
|
|
29
|
-
'humaneval',
|
|
30
|
-
'gsm8k',
|
|
31
|
-
'bbh',
|
|
32
|
-
'competition_math',
|
|
33
|
-
'math_500',
|
|
34
|
-
'aime24',
|
|
35
|
-
'gpqa',
|
|
36
|
-
'arc',
|
|
37
|
-
'ceval',
|
|
38
|
-
'hellaswag',
|
|
39
|
-
'general_mcq',
|
|
40
|
-
'general_qa',
|
|
41
|
-
'super_gpqa',
|
|
42
|
-
'live_code_bench',
|
|
43
|
-
'mmlu_redux',
|
|
44
|
-
'simple_qa',
|
|
45
|
-
'chinese_simpleqa',
|
|
46
|
-
'alpaca_eval',
|
|
47
|
-
'arena_hard',
|
|
48
|
-
'maritime_bench',
|
|
49
|
-
'drop',
|
|
50
|
-
'winogrande',
|
|
51
|
-
'tool_bench',
|
|
20
|
+
# 'iquiz',
|
|
21
|
+
# 'ifeval',
|
|
22
|
+
# 'mmlu',
|
|
23
|
+
# 'mmlu_pro',
|
|
24
|
+
# 'musr',
|
|
25
|
+
# 'process_bench',
|
|
26
|
+
# 'race',
|
|
27
|
+
# 'trivia_qa',
|
|
28
|
+
# 'cmmlu',
|
|
29
|
+
# 'humaneval',
|
|
30
|
+
# 'gsm8k',
|
|
31
|
+
# 'bbh',
|
|
32
|
+
# 'competition_math',
|
|
33
|
+
# 'math_500',
|
|
34
|
+
# 'aime24',
|
|
35
|
+
# 'gpqa',
|
|
36
|
+
# 'arc',
|
|
37
|
+
# 'ceval',
|
|
38
|
+
# 'hellaswag',
|
|
39
|
+
# 'general_mcq',
|
|
40
|
+
# 'general_qa',
|
|
41
|
+
# 'super_gpqa',
|
|
42
|
+
# 'live_code_bench',
|
|
43
|
+
# 'mmlu_redux',
|
|
44
|
+
# 'simple_qa',
|
|
45
|
+
# 'chinese_simpleqa',
|
|
46
|
+
# 'alpaca_eval',
|
|
47
|
+
# 'arena_hard',
|
|
48
|
+
# 'maritime_bench',
|
|
49
|
+
# 'drop',
|
|
50
|
+
# 'winogrande',
|
|
51
|
+
# 'tool_bench',
|
|
52
|
+
'frames',
|
|
53
|
+
'docmath',
|
|
54
|
+
'needle_haystack'
|
|
52
55
|
]
|
|
53
56
|
|
|
54
57
|
dataset_args={
|
|
@@ -131,7 +134,7 @@ class TestRun(unittest.TestCase):
|
|
|
131
134
|
from evalscope.config import TaskConfig
|
|
132
135
|
|
|
133
136
|
task_cfg = TaskConfig(
|
|
134
|
-
model='
|
|
137
|
+
model='qwen-plus',
|
|
135
138
|
api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
136
139
|
api_key= env.get('DASHSCOPE_API_KEY'),
|
|
137
140
|
eval_type=EvalType.SERVICE,
|
|
@@ -145,9 +148,10 @@ class TestRun(unittest.TestCase):
|
|
|
145
148
|
'n': 1,
|
|
146
149
|
'max_tokens': 4096,
|
|
147
150
|
},
|
|
151
|
+
judge_worker_num=5,
|
|
148
152
|
judge_strategy=JudgeStrategy.AUTO,
|
|
149
153
|
judge_model_args={
|
|
150
|
-
'model_id': 'qwen2.5-
|
|
154
|
+
'model_id': 'qwen2.5-72b-instruct',
|
|
151
155
|
'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
152
156
|
'api_key': env.get('DASHSCOPE_API_KEY'),
|
|
153
157
|
}
|
tests/cli/test_collection.py
CHANGED
|
@@ -72,14 +72,15 @@ class TestCollection(unittest.TestCase):
|
|
|
72
72
|
'local_path': 'outputs/mixed_data_test.jsonl'
|
|
73
73
|
# 'local_path': 'outputs/weighted_mixed_data.jsonl'
|
|
74
74
|
}},
|
|
75
|
-
limit=
|
|
76
|
-
judge_strategy=JudgeStrategy.
|
|
75
|
+
limit=5,
|
|
76
|
+
judge_strategy=JudgeStrategy.AUTO,
|
|
77
77
|
judge_model_args={
|
|
78
|
-
'model_id': 'qwen2.5-
|
|
79
|
-
'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
80
|
-
'api_key': os.getenv('DASHSCOPE_API_KEY'),
|
|
78
|
+
# 'model_id': 'qwen2.5-72b-instruct',
|
|
79
|
+
# 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
80
|
+
# 'api_key': os.getenv('DASHSCOPE_API_KEY'),
|
|
81
81
|
},
|
|
82
|
-
|
|
82
|
+
analysis_report=True,
|
|
83
|
+
# use_cache='outputs/20250522_204520'
|
|
83
84
|
)
|
|
84
85
|
res = run_task(task_cfg=task_cfg)
|
|
85
86
|
print(res)
|