evalscope 0.16.0__py3-none-any.whl → 0.16.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (61) hide show
  1. evalscope/app/__init__.py +28 -0
  2. evalscope/{report → app}/app.py +20 -25
  3. evalscope/app/constants.py +21 -0
  4. evalscope/arguments.py +2 -1
  5. evalscope/backend/opencompass/backend_manager.py +2 -1
  6. evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
  7. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  8. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  9. evalscope/backend/rag_eval/utils/embedding.py +75 -35
  10. evalscope/benchmarks/benchmark.py +1 -0
  11. evalscope/benchmarks/data_adapter.py +97 -16
  12. evalscope/benchmarks/docmath/__init__.py +0 -0
  13. evalscope/benchmarks/docmath/docmath_adapter.py +84 -0
  14. evalscope/benchmarks/docmath/utils.py +220 -0
  15. evalscope/benchmarks/frames/__init__.py +0 -0
  16. evalscope/benchmarks/frames/frames_adapter.py +90 -0
  17. evalscope/benchmarks/frames/utils.py +37 -0
  18. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  19. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +341 -0
  20. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  21. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +4 -1
  22. evalscope/benchmarks/tool_bench/utils.py +5 -4
  23. evalscope/benchmarks/utils.py +25 -0
  24. evalscope/cli/start_app.py +2 -2
  25. evalscope/collections/__init__.py +35 -3
  26. evalscope/collections/evaluator.py +18 -6
  27. evalscope/config.py +8 -2
  28. evalscope/evaluator/evaluator.py +38 -27
  29. evalscope/metrics/__init__.py +3 -1
  30. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  31. evalscope/metrics/llm_judge.py +12 -5
  32. evalscope/metrics/math_parser.py +1 -1
  33. evalscope/models/adapters/server_adapter.py +2 -6
  34. evalscope/perf/arguments.py +2 -2
  35. evalscope/perf/benchmark.py +0 -9
  36. evalscope/perf/main.py +7 -0
  37. evalscope/perf/plugin/datasets/custom.py +15 -0
  38. evalscope/perf/utils/benchmark_util.py +1 -1
  39. evalscope/perf/utils/local_server.py +1 -0
  40. evalscope/perf/utils/log_utils.py +12 -5
  41. evalscope/perf/utils/rich_display.py +1 -1
  42. evalscope/report/__init__.py +36 -4
  43. evalscope/report/combinator.py +8 -0
  44. evalscope/report/generator.py +33 -9
  45. evalscope/report/utils.py +60 -3
  46. evalscope/run.py +12 -0
  47. evalscope/utils/logger.py +1 -1
  48. evalscope/utils/utils.py +12 -0
  49. evalscope/version.py +2 -2
  50. {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/METADATA +13 -11
  51. {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/RECORD +61 -50
  52. tests/aigc/test_t2i.py +40 -3
  53. tests/cli/test_all.py +39 -35
  54. tests/cli/test_collection.py +7 -6
  55. tests/cli/test_run.py +21 -11
  56. tests/rag/test_mteb.py +5 -5
  57. /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
  58. {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/LICENSE +0 -0
  59. {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/WHEEL +0 -0
  60. {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/entry_points.txt +0 -0
  61. {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/top_level.txt +0 -0
evalscope/utils/logger.py CHANGED
@@ -10,7 +10,7 @@ simple_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
10
10
 
11
11
  detailed_formatter = logging.Formatter(detailed_format)
12
12
  simple_formatter = logging.Formatter(simple_format)
13
- DEFAULT_LEVEL = logging.DEBUG if os.getenv('LOG_LEVEL', 'INFO') == 'DEBUG' else logging.INFO
13
+ DEFAULT_LEVEL = logging.DEBUG if os.getenv('EVALSCOPE_LOG_LEVEL', 'INFO') == 'DEBUG' else logging.INFO
14
14
 
15
15
  logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL, force=True)
16
16
 
evalscope/utils/utils.py CHANGED
@@ -10,6 +10,7 @@ import os
10
10
  import random
11
11
  import re
12
12
  import torch
13
+ from inspect import signature
13
14
  from typing import Any, Dict, List, Tuple, Union
14
15
 
15
16
  from evalscope.utils.logger import get_logger
@@ -313,6 +314,17 @@ def seed_everything(seed: int):
313
314
  torch.backends.cudnn.deterministic = True
314
315
  torch.backends.cudnn.benchmark = False
315
316
 
317
+ def get_supported_params(func):
318
+ """Get the supported parameters of a function."""
319
+ sig = signature(func)
320
+ return list(sig.parameters.keys())
321
+
322
+ def parse_int_or_float(num):
323
+ number = float(num)
324
+ if number.is_integer():
325
+ return int(number)
326
+ return number
327
+
316
328
  if __name__ == '__main__':
317
329
  options = ['A', 'B', 'C', 'D']
318
330
  answers = ['Context .... ANSWER: A', 'answer: A']
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
1
1
  # Copyright (c) Alibaba, Inc. and its affiliates.
2
2
 
3
- __version__ = '0.16.0'
4
- __release_datetime__ = '2025-05-19 18:00:00'
3
+ __version__ = '0.16.1'
4
+ __release_datetime__ = '2025-06-03 20:00:00'
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: evalscope
3
- Version: 0.16.0
3
+ Version: 0.16.1
4
4
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
5
5
  Home-page: https://github.com/modelscope/evalscope
6
6
  Author: ModelScope team
@@ -17,12 +17,12 @@ Requires-Python: >=3.8
17
17
  Description-Content-Type: text/markdown
18
18
  License-File: LICENSE
19
19
  Requires-Dist: accelerate
20
- Requires-Dist: datasets<=3.2.0,>=3.0.0
20
+ Requires-Dist: datasets>=3.0
21
21
  Requires-Dist: immutabledict
22
22
  Requires-Dist: jieba
23
23
  Requires-Dist: jsonlines
24
24
  Requires-Dist: langdetect
25
- Requires-Dist: latex2sympy2
25
+ Requires-Dist: latex2sympy2-extended
26
26
  Requires-Dist: matplotlib
27
27
  Requires-Dist: modelscope[framework]
28
28
  Requires-Dist: nltk>=3.9
@@ -52,12 +52,12 @@ Requires-Dist: open-clip-torch; extra == "aigc"
52
52
  Requires-Dist: opencv-python; extra == "aigc"
53
53
  Provides-Extra: all
54
54
  Requires-Dist: accelerate; extra == "all"
55
- Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
55
+ Requires-Dist: datasets>=3.0; extra == "all"
56
56
  Requires-Dist: immutabledict; extra == "all"
57
57
  Requires-Dist: jieba; extra == "all"
58
58
  Requires-Dist: jsonlines; extra == "all"
59
59
  Requires-Dist: langdetect; extra == "all"
60
- Requires-Dist: latex2sympy2; extra == "all"
60
+ Requires-Dist: latex2sympy2-extended; extra == "all"
61
61
  Requires-Dist: matplotlib; extra == "all"
62
62
  Requires-Dist: modelscope[framework]; extra == "all"
63
63
  Requires-Dist: nltk>=3.9; extra == "all"
@@ -79,13 +79,13 @@ Requires-Dist: torchvision; extra == "all"
79
79
  Requires-Dist: tqdm; extra == "all"
80
80
  Requires-Dist: transformers>=4.33; extra == "all"
81
81
  Requires-Dist: word2number; extra == "all"
82
- Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
83
- Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
82
+ Requires-Dist: ms-opencompass>=0.1.6; extra == "all"
83
+ Requires-Dist: ms-vlmeval>=0.0.17; extra == "all"
84
84
  Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "all"
85
85
  Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "all"
86
86
  Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "all"
87
87
  Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "all"
88
- Requires-Dist: mteb==1.19.4; extra == "all"
88
+ Requires-Dist: mteb==1.38.20; extra == "all"
89
89
  Requires-Dist: ragas==0.2.14; extra == "all"
90
90
  Requires-Dist: webdataset>0.2.0; extra == "all"
91
91
  Requires-Dist: aiohttp; extra == "all"
@@ -106,7 +106,7 @@ Provides-Extra: app
106
106
  Requires-Dist: gradio==5.4.0; extra == "app"
107
107
  Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
108
108
  Provides-Extra: opencompass
109
- Requires-Dist: ms-opencompass>=0.1.4; extra == "opencompass"
109
+ Requires-Dist: ms-opencompass>=0.1.6; extra == "opencompass"
110
110
  Provides-Extra: perf
111
111
  Requires-Dist: aiohttp; extra == "perf"
112
112
  Requires-Dist: fastapi; extra == "perf"
@@ -120,11 +120,11 @@ Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "rag"
120
120
  Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "rag"
121
121
  Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "rag"
122
122
  Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "rag"
123
- Requires-Dist: mteb==1.19.4; extra == "rag"
123
+ Requires-Dist: mteb==1.38.20; extra == "rag"
124
124
  Requires-Dist: ragas==0.2.14; extra == "rag"
125
125
  Requires-Dist: webdataset>0.2.0; extra == "rag"
126
126
  Provides-Extra: vlmeval
127
- Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
127
+ Requires-Dist: ms-vlmeval>=0.0.17; extra == "vlmeval"
128
128
 
129
129
  <p align="center">
130
130
  <br>
@@ -230,6 +230,8 @@ Please scan the QR code below to join our community groups:
230
230
 
231
231
  ## 🎉 News
232
232
 
233
+ - 🔥 **[2025.06.02]** Added support for the Needle-in-a-Haystack test. Simply specify `needle_haystack` to conduct the test, and a corresponding heatmap will be generated in the `outputs/reports` folder, providing a visual representation of the model's performance. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html) for more details.
234
+ - 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
233
235
  - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
234
236
  - 🔥 **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate model's tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
235
237
  - 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
@@ -1,16 +1,20 @@
1
1
  evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
2
- evalscope/arguments.py,sha256=fZW-om5E2_JaFcEmkvahvundjedPLgIDde-zwDXinG0,5868
3
- evalscope/config.py,sha256=19QaZ5VS8wknt4sLBxiZkR6pH-nm4Ph3Kl-1bZgcQcE,10799
2
+ evalscope/arguments.py,sha256=QkxE8eGSryiyo9uDiNQNZUI3l_hGPYmhVz1-KHgtB6E,6044
3
+ evalscope/config.py,sha256=HGvIlhjVjA9QtAiNEUrx_hev3wa-RaNEXelEiLJn9OM,11015
4
4
  evalscope/constants.py,sha256=PHnsGndB4N5-jvmawPxMK5b9geE2Es5cUe8ZKYSuKgM,4016
5
- evalscope/run.py,sha256=_DKbxgQGwhweBnQrI7lQhu5eoz4LYPVeNanzD4lHuJA,6476
5
+ evalscope/run.py,sha256=saHZGlwbBLYtFk4BmKkjQEOOHQQ-pDKzN21taao6Os0,6957
6
6
  evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
7
7
  evalscope/summarizer.py,sha256=61kU5ZoSh1dd8HMJPqP3ZvJwcY9szwWFCZdu2lfATJA,5920
8
- evalscope/version.py,sha256=8STVV6Y877B3esrgvovInSk4IFNzxZ_ZEz9ND_6B2lQ,119
8
+ evalscope/version.py,sha256=vMuGTezikPNdTLYlejHdHznB5WhuHCnAhaOdw3iqU5E,119
9
+ evalscope/app/__init__.py,sha256=HWLXld_JXcBDsdL4L_4E8JsKyuBwwPUSwlejKnZ3HKc,579
10
+ evalscope/app/app.py,sha256=sTYoc3Uag7DqYbb_qXo8QJX4oer8dueQK1wdgaLlTiY,29371
11
+ evalscope/app/arguments.py,sha256=1wHTLeFx1G94cKXYOeOVe_wTiOY2D929UctIRGOtRaQ,699
12
+ evalscope/app/constants.py,sha256=KpItEl9lF0VldOm0grjS7RVbbseemtsXZJKtgGmAQB8,361
9
13
  evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
14
  evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
11
15
  evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
12
16
  evalscope/backend/opencompass/api_meta_template.py,sha256=DaBJg15ZSIjxroXiygl3-4RdmIe_FD7xHbXvjSZmkQA,1706
13
- evalscope/backend/opencompass/backend_manager.py,sha256=y5NnAIY1pI7E1ZSeKU3acrD-oyH3uMGL7M3nPp1WiHU,10381
17
+ evalscope/backend/opencompass/backend_manager.py,sha256=kIPzirjAOW0_YNQiCrhjRfAVD3UpcGmr4RXBH-WMH0Y,10409
14
18
  evalscope/backend/opencompass/tasks/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
15
19
  evalscope/backend/opencompass/tasks/eval_api.py,sha256=ZaGdUbEOtAW5VX3ZXmpHIttg_QrID34EnBTylD3uvos,1152
16
20
  evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=JHSq4EnPJgv4sRJJplLH80EqE3ghtkn2k8HnV6DaDew,5406
@@ -27,12 +31,12 @@ evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py,sha256=t0U
27
31
  evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py,sha256=rZY-TulG-Cb8b6GTBxqTDYQ_4Ois3kbgKhuunZq8Ato,8407
28
32
  evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt,sha256=eiiAaxhS48b5rVLy5O9VvFfV2AfxY86ITu_iqT7ZLkQ,649
29
33
  evalscope/backend/rag_eval/cmteb/__init__.py,sha256=I502GHPFYo8BwlFvoljGKI24PY76eBXJQiquWk8nJNU,280
30
- evalscope/backend/rag_eval/cmteb/arguments.py,sha256=y2iTbs3a7R747NgS00nK2j3zO7gmREh8n7mWMrzF1js,2653
34
+ evalscope/backend/rag_eval/cmteb/arguments.py,sha256=xROhoVxJvMhhU9S5SKtiavQHM447esbrVWlbmes4AVI,2814
31
35
  evalscope/backend/rag_eval/cmteb/base.py,sha256=UCobQ81dHkiTmIz_0BJ_VANj_uG6mkJbYLKJztvMXfo,2849
32
- evalscope/backend/rag_eval/cmteb/task_template.py,sha256=FyFs1reefcsFCrWyi7Ya5dnFYvBhtxph2wIaFtOtFls,2595
36
+ evalscope/backend/rag_eval/cmteb/task_template.py,sha256=vPfbBvtVjX6U6QHEG5mRP9CQjFMF-_8EdrpYoNHbDFU,3303
33
37
  evalscope/backend/rag_eval/cmteb/tasks/Classification.py,sha256=sqbH0XmSiIm4n5UX5sXMwJHby1r-d35mwW1tKIhb2Hg,10848
34
38
  evalscope/backend/rag_eval/cmteb/tasks/Clustering.py,sha256=-GTwORxILSkkXXGtTxuPTKSHNXQEllCRoUjuR7pnwFM,8962
35
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py,sha256=pbZBfjeVAKbjLy4tEk6KUVDv-Rv8HNHYWuNkfqf-Vwk,2025
39
+ evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py,sha256=_uuDPaerh6qbxw7W3DiPrWuxfEyLeKCHeduYcp-1Veg,2025
36
40
  evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py,sha256=yISp67pXw4fSrsqTiYmfas6uPyqwE45L1c58Tpydc0E,4075
37
41
  evalscope/backend/rag_eval/cmteb/tasks/Reranking.py,sha256=AH7jwJ45WAVxVb60I2DTURVanIAbrlZzk-ey_dHWEO0,5491
38
42
  evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py,sha256=ofmmeoieXHmU6O14JKWO9GUpuEEmcWwc78Q7ZJjRDZs,11454
@@ -49,15 +53,15 @@ evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=YSqpaXMFVe8m
49
53
  evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=6x-4O2pgsjZCVfJNvwZEKcgLe_QhSknPg-f2jGjZkU4,1890
50
54
  evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
51
55
  evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
52
- evalscope/backend/rag_eval/utils/embedding.py,sha256=tFMepPAMO4Kkqeqh-XxXIDYRjGbCMlk7lwuUW7FNvCA,7977
56
+ evalscope/backend/rag_eval/utils/embedding.py,sha256=3CkLX6SXGAc6ltUQe4V_IcTr71cZSane5-VjaRYn13M,9466
53
57
  evalscope/backend/rag_eval/utils/llm.py,sha256=NHjm0SeQVsSIG8uISXZcQypku4QRc3KtteeO9ldv0FI,2611
54
58
  evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
55
59
  evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
56
60
  evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=sUYvQxCtPl6CrcwhQpY8lJjW5skqWc-fvHUSnXd_MvQ,6054
57
61
  evalscope/benchmarks/__init__.py,sha256=5AXNhhmbaBFEe3u7y5TtIrviYzFI-hC8oKqxFILs1pE,937
58
- evalscope/benchmarks/benchmark.py,sha256=a_7Ctz36McuTyBSTYi56jis9pvOdWhg7JVSPFrbxqR4,2535
59
- evalscope/benchmarks/data_adapter.py,sha256=lcBoXhI1Byn0HcwbVxmIeUFxZlz_wiqte6RDPOR8sbM,18184
60
- evalscope/benchmarks/utils.py,sha256=jB9w3mN1eOur6j2kpQB_XZJ912fhzC0GaSeHOoylK7M,1087
62
+ evalscope/benchmarks/benchmark.py,sha256=X-vBzz5PDVI5rBbqWpiUZq0bmGhp9cRZiA27XCgxPdE,2573
63
+ evalscope/benchmarks/data_adapter.py,sha256=Z2s4mfJssxNAeFPVNgZLkBbc3DBbJRZNGbRBigLe4I4,22893
64
+ evalscope/benchmarks/utils.py,sha256=81MwUJYWjJgoiRClY-IFB-EZN0th-oQDTvU2ekaEmpc,1869
61
65
  evalscope/benchmarks/aigc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
62
66
  evalscope/benchmarks/aigc/t2i/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
63
67
  evalscope/benchmarks/aigc/t2i/base.py,sha256=4GFAvceT1Gpt5teDLRCZi62RwvPazuhG3zwft3gN3X4,2102
@@ -120,9 +124,15 @@ evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc
120
124
  evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=wgejW-_QswtT8_3JKAQ_H6svH8IotDJDBEH7X4nP4bY,6760
121
125
  evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
122
126
  evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=QgLgIrjD3q53T-lu1UWTV6T4h1cKGoCQDh0O4QxFezw,2569
127
+ evalscope/benchmarks/docmath/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
128
+ evalscope/benchmarks/docmath/docmath_adapter.py,sha256=GAoHuFASKyWCVbB0nmImsEB-YCREwB75WjdqYB0CcyU,2912
129
+ evalscope/benchmarks/docmath/utils.py,sha256=ptd-Sot4QtUmUG4dMlqXtUWHKZplo5jSTolsypqX9Ho,7716
123
130
  evalscope/benchmarks/drop/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
124
131
  evalscope/benchmarks/drop/drop_adapter.py,sha256=V-Vx6g2_1kcDUDWOKVX1vPSLt5iHn8NQkpWbsIwPaa4,8325
125
132
  evalscope/benchmarks/drop/utils.py,sha256=Z9PHrNnRfGqFHCLONg5SWKARp1eTJlHFc_bU46t_YrM,1344
133
+ evalscope/benchmarks/frames/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
134
+ evalscope/benchmarks/frames/frames_adapter.py,sha256=wbug6yDlq6N5SfCQaOn43K8klJjrZc9iigFEPQs5nKA,3096
135
+ evalscope/benchmarks/frames/utils.py,sha256=gULWM6Rwv5bTSSWcDYp-iSIoWj8r5VtbQakhRzHJq8A,1172
126
136
  evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
127
137
  evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=fqbt61owPP7t2H4B2zbYVZTs0VBGuXNvWGvkukwhRYc,5039
128
138
  evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
@@ -169,6 +179,9 @@ evalscope/benchmarks/mmlu_redux/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
169
179
  evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=Kr30i_exxBJRz9PLB5g6F04e2HJ4WuF6LDyAwaRh2MY,9578
170
180
  evalscope/benchmarks/musr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
171
181
  evalscope/benchmarks/musr/musr_adapter.py,sha256=85P0sY7H9pthYdCjkE2AOxaiNhcIBW1iZmODkz3FN0M,2464
182
+ evalscope/benchmarks/needle_haystack/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
183
+ evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py,sha256=rNi7ULskhhHh1eVN1eV15gyLVFE05uertlZlCzMzgOE,15355
184
+ evalscope/benchmarks/needle_haystack/utils.py,sha256=bDwtpMS7Eqr63urCttS9i3BvT_aPuNvrQU-vEc6tcx0,2911
172
185
  evalscope/benchmarks/process_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
173
186
  evalscope/benchmarks/process_bench/critique_template.txt,sha256=tycx8n42QEC0uGcwbIvHfZvfTnchlRxGz8Tp1R2_e_Y,489
174
187
  evalscope/benchmarks/process_bench/process_bench_adapter.py,sha256=ydU-r1T0DaYhOxkhZgGL7PhDd4XoeqOBzVO9oiFPd8M,3422
@@ -184,8 +197,8 @@ evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=BqNLL8BYnK6tRuIdV6i
184
197
  evalscope/benchmarks/super_gpqa/utils.py,sha256=ftYPP9ODvLBlQSd9ltACx9iRIvjB8u1bg4AtgcJ4JAI,3360
185
198
  evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt,sha256=y7hR9SmoR_YqoEWtT8N9JpZOpeJIlg0cDGDgYw6R6hM,237
186
199
  evalscope/benchmarks/tool_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
187
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py,sha256=l2dBcJ4Z3m-8QFtfyFH4IqMtvkY3Rfk021P9Ff_lXWQ,2270
188
- evalscope/benchmarks/tool_bench/utils.py,sha256=vIPsL8FmMF2JZRHCZeLS_dDeATKNRvZDbq6T-Znlk8Q,7025
200
+ evalscope/benchmarks/tool_bench/tool_bench_adapter.py,sha256=c8_Cok_wctlBtWd7kDQY9McaFbkWsW9LTC5JzPpef-Q,2399
201
+ evalscope/benchmarks/tool_bench/utils.py,sha256=led0d-Pa3rvmWkSWhEnZWP00fceudgESq5HXAQzJGls,7042
189
202
  evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
190
203
  evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
191
204
  evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
@@ -198,27 +211,27 @@ evalscope/benchmarks/winogrande/winogrande_adapter.py,sha256=UdANz3YmCtV2YfGuEih
198
211
  evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
199
212
  evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
200
213
  evalscope/cli/cli.py,sha256=w_dtXljur9s5lmTn6LbbFL_viTPQB1WAEzhYcId09Og,729
201
- evalscope/cli/start_app.py,sha256=PoAnmYLw_UdWpA7qrUkSIx8hRoIGRy9yXrbH8bYOSL4,804
214
+ evalscope/cli/start_app.py,sha256=dV63nvBYEUl2sGeVxoUH4IJBXJSLecaq293i3alBWxo,794
202
215
  evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,775
203
216
  evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
204
217
  evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
205
- evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
206
- evalscope/collections/evaluator.py,sha256=3sz_bL0HMFkxq3C-4P6rNGrnQolifVISI5sEpT3Bt90,15754
218
+ evalscope/collections/__init__.py,sha256=3v7tVLcJk86FeNBrxw3pWhu_lcpKYrnT_dDACCeR2Io,853
219
+ evalscope/collections/evaluator.py,sha256=NnLel9lOyR0wzOwxDGSCFWJN4zFx9ZA2hc0PI-FSvl0,16200
207
220
  evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
208
221
  evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
209
222
  evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
210
- evalscope/evaluator/evaluator.py,sha256=QzTFXiv_WdPpWTB3PgBNIz9KS_Rxu-fWDvoUpML23aA,21651
223
+ evalscope/evaluator/evaluator.py,sha256=d8cFq08oJ6kbKcwr4mVh517OxndgyqUrmuEP-bwmR6g,22071
211
224
  evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
212
225
  evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
213
226
  evalscope/evaluator/reviewer/auto_reviewer.py,sha256=5WRYuXFTDgVmolrOdiTysk-mXrpw6Qg87-iuY-VD1W4,16618
214
- evalscope/metrics/__init__.py,sha256=y1sdj5FBKYW1q5kLC6QREzoITHwstJRUdji6p0X5aAE,1363
215
- evalscope/metrics/llm_judge.py,sha256=qYHsoBz-zXjL57Czl9CaPcyJT5SZr05giv5Q9SFK3cY,4000
216
- evalscope/metrics/math_parser.py,sha256=uTDudn305G3b8-GboWTrDE6OfrEwAW-areHnoGXZ6Is,17302
227
+ evalscope/metrics/__init__.py,sha256=g96dZSt3Dh56TdVbe4yDqcfmr9DoLqH-R2__3Qvorjk,1497
228
+ evalscope/metrics/llm_judge.py,sha256=O2IaJpsBe1HqfCVnRYOt_PLWg6w85DYlYLU7yTq5idw,4384
229
+ evalscope/metrics/math_parser.py,sha256=JtOkj28XOtwoUACXOXLzCeRYz0rx0tBsQLQDU8cbC20,17311
217
230
  evalscope/metrics/metrics.py,sha256=_YI7RhxlFu_JOgeE3LF9UKu6mJruvyu4FgqVf78Bjb8,13813
218
231
  evalscope/metrics/named_metrics.py,sha256=PrzU_1mGTeRFxVJFT1aXxIOiS7MnNoWyZsb8uCRVDeE,2278
219
232
  evalscope/metrics/rouge_metric.py,sha256=bqvSotuDdC0MEKmt8v6y6tBTBx0S3Ma-tfF-cMCckA4,4645
220
233
  evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
221
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=m7v8ZysO9zCuyThEoGTe5QNVt2GsKMgZpH6du1FQCvg,12110
234
+ evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=T91PgJfi1As7BR7I-Hq6rLlvHAtMB9JpBw9gMTH8VlE,12114
222
235
  evalscope/metrics/t2v_metrics/__init__.py,sha256=GBxgKTPVy_qhW_F3M4Oi6QMWhdAi4PqGX5w3t6Tueho,1783
223
236
  evalscope/metrics/t2v_metrics/clipscore.py,sha256=IsrYKIlFb04-FfBq4MbSv4diS6706J15Y3G4qEFIwfU,455
224
237
  evalscope/metrics/t2v_metrics/constants.py,sha256=oY5l5fOFl8qylah9eeebZm0pgY1PYmHDa7JlUC8Qls0,451
@@ -329,16 +342,16 @@ evalscope/models/adapters/base_adapter.py,sha256=f2FY8DLERudkfb4_anxNVFE_D19xCJj
329
342
  evalscope/models/adapters/chat_adapter.py,sha256=PAClyBL_nQ1I1kmjeeZ3sdC-y5ZmfFj8rjCigh_vr40,7885
330
343
  evalscope/models/adapters/choice_adapter.py,sha256=4fuz3MFEqK8ln4mMs3goMCdRPBwYmmgN70HTdr_sW_U,8005
331
344
  evalscope/models/adapters/custom_adapter.py,sha256=w8cD0b3xgcdhSZelcat67CGJnALOfz5IALzURnLjab8,2275
332
- evalscope/models/adapters/server_adapter.py,sha256=d-0ne7ymWXmvKf_ypJ0093RNwplZJwhvU2xRwc8rt70,6581
345
+ evalscope/models/adapters/server_adapter.py,sha256=qdonCJLoM0qmFQtHziczUqVzA31p4AxIn2j9oNIosLw,6493
333
346
  evalscope/models/adapters/t2i_adapter.py,sha256=xkMRyZ61yTiJfmULK-p9du4nNox41pkHiV2CTFBO3qM,2659
334
347
  evalscope/models/custom/__init__.py,sha256=MZylegALg1HerOYtp-qbzu4Wb6PW3JbrxwONHU-PAVs,131
335
348
  evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
336
349
  evalscope/models/custom/dummy_model.py,sha256=WRT_aCBZLXnC4yRCgggkuySkhM71C47O2Txx_YNc3UM,1933
337
350
  evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
338
- evalscope/perf/arguments.py,sha256=5dTtaBR9BIobaKkX1Xj-mphHDG4uugnGaVOvWpLfN04,10714
339
- evalscope/perf/benchmark.py,sha256=eGnxMLQXSYBGRJS4tS8geSJAirnuWo35M4orlRZzei8,7847
351
+ evalscope/perf/arguments.py,sha256=HUKzcU-FBt34DgGJ0nc5rNgJAMpZwYQXMz8VU8jokco,10668
352
+ evalscope/perf/benchmark.py,sha256=qEgIX_Z4x3FNtAKTMlP2mRJTerRV5seCbVtB4XklnQI,7566
340
353
  evalscope/perf/http_client.py,sha256=-c3-N7bxKsj3d5DVsKSaYA3XAHJDzZgoqZBbhuDYIGk,7419
341
- evalscope/perf/main.py,sha256=clHzkQNmv7wv-OWkuNGDQ-8YoLUCWxARIX-Eisinpms,3096
354
+ evalscope/perf/main.py,sha256=yfJWGd2l4uU_qKW9bD6DzV0DK9XXuCJGLYjF_JWR22E,3394
342
355
  evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
343
356
  evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
344
357
  evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
@@ -348,7 +361,7 @@ evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4I
348
361
  evalscope/perf/plugin/api/openai_api.py,sha256=kTL_2OACuKhzd2W0Pf4DirpMumzk4V3rqKZ2mvBZVCs,7655
349
362
  evalscope/perf/plugin/datasets/__init__.py,sha256=Z6Jc0RxJS_z0nBBV1-b0-56Ija60AtQ7I_67gY6ZfdQ,568
350
363
  evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
351
- evalscope/perf/plugin/datasets/custom.py,sha256=npreC7H1VsdTGYkqlMESvyOhtXOfZQA7_-ICmxe3FWk,936
364
+ evalscope/perf/plugin/datasets/custom.py,sha256=-meul2hRmYvYAo--c_EtCnItRi5DvN7xxFOpq6vqdts,1346
352
365
  evalscope/perf/plugin/datasets/flickr8k.py,sha256=MbJKEB0XqZE0nDEenwYs0FLH9QL658Vn9uQmUH4hPvk,1605
353
366
  evalscope/perf/plugin/datasets/line_by_line.py,sha256=AqZYG6tVL3BIGnzh_2Tev8lDYezJG_1gqJY8bSNQl3Q,957
354
367
  evalscope/perf/plugin/datasets/longalpaca.py,sha256=XelLris0-c3StLInQ-Oav4jqGcXPNfJxEDeYvaetEbI,1297
@@ -357,12 +370,12 @@ evalscope/perf/plugin/datasets/random_dataset.py,sha256=SIlsjAE_Stknfr6o1CBFvANB
357
370
  evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=J6q7AF_Re5eHLVejXEw9c1jlk1T1PPmist0yO9UFTPE,2432
358
371
  evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
359
372
  evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
360
- evalscope/perf/utils/benchmark_util.py,sha256=PcRTeKlEIslBw0zKVS2mFg6GgJ6J8m1f2-gAaEBeiHI,7236
373
+ evalscope/perf/utils/benchmark_util.py,sha256=EPKUDijue85b8KhSJoJKLh6comkTKRjq2yoEw4kxBho,7227
361
374
  evalscope/perf/utils/db_util.py,sha256=xqrXZapP_WwUdzkgFBTh3LDBWzr_UoU8v13rOjQ8TT4,9876
362
375
  evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
363
- evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627
364
- evalscope/perf/utils/log_utils.py,sha256=Xm5A8g8BaozaI_0TaPzr2aAxUBCCf-w7II-FcifrIYg,1503
365
- evalscope/perf/utils/rich_display.py,sha256=SavP2L44UwN58ZUGR2W1wxM4h4F1iyPa90HhT-Ypkzs,8125
376
+ evalscope/perf/utils/local_server.py,sha256=RL9rGd5tEniZ0aErhHcbVXMX22YmujfE11T3j37VL8k,4684
377
+ evalscope/perf/utils/log_utils.py,sha256=NWSK_ITG4yoVx5GMLbIRGDoXSs90s7X3mftdm37Os2U,1666
378
+ evalscope/perf/utils/rich_display.py,sha256=xZzeryQbYM6Cv8g1ulK6OQUE2CalQ_KtFxiy7pioeEU,8127
366
379
  evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
367
380
  evalscope/registry/config/cfg_arena.yaml,sha256=rub6ceaQxxB1mbSjdoFf0IaVgGfbOonV2nYRebv2OKo,3292
368
381
  evalscope/registry/config/cfg_arena_zhihu.yaml,sha256=tvvihBwvoTjoezwTSaZwoGOB44ysofpnin4pNyY9TfQ,2755
@@ -384,12 +397,10 @@ evalscope/registry/tasks/general_qa.yaml,sha256=S3kdlrazWX2VAX2PMhNtBnFZVSnUKBNi
384
397
  evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHMG0LXiM,729
385
398
  evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
386
399
  evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
387
- evalscope/report/__init__.py,sha256=iLNqx7CnHSHQmOBqWUK_vt2VIjnvGslJTqn--7B4y_s,316
388
- evalscope/report/app.py,sha256=FxNpiEmbpH_B7D5SYN42idGsyOgkgFrLzScOVrwL3SI,28998
389
- evalscope/report/app_arguments.py,sha256=1wHTLeFx1G94cKXYOeOVe_wTiOY2D929UctIRGOtRaQ,699
390
- evalscope/report/combinator.py,sha256=O3QirwtYhDhdaWVT4STJMCGZMwoX8BTeJ3HtS9iwnWQ,2567
391
- evalscope/report/generator.py,sha256=q9aHWNjQgvutAKtpjfWOpfu5zNFdnXilO9OqBqt_Phg,3612
392
- evalscope/report/utils.py,sha256=uu-rAzoN6ZIlv52IDWSZCcmNVY3DscNo2f9H9-gjZHY,4602
400
+ evalscope/report/__init__.py,sha256=mLCgT7G-WPagQHOGz97AOdLQJjyikrswDiXA8d9Wr_Q,923
401
+ evalscope/report/combinator.py,sha256=xGX0B6tGZxaEB20tziPQm3HUkvgftghKg5AEQ8JpsBE,2842
402
+ evalscope/report/generator.py,sha256=oykmQROG-Bt8ttCH4RtvmGJ39HmDJMTU6gG26lg5LHE,4321
403
+ evalscope/report/utils.py,sha256=KAc4Cq8NMxTUjCJHI5MK3ZqzBNjfDMXrwLBpUkaywjk,6520
393
404
  evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
394
405
  evalscope/third_party/longbench_write/README.md,sha256=1yLKeSVIcihpoc4KXr8NpK86JbcLssCPx76aOKdPbYI,5431
395
406
  evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
@@ -433,22 +444,22 @@ evalscope/utils/deprecation_utils.py,sha256=WyeiLWSi5ti6FkuMbhimcPPUB43paa1FZ5-J
433
444
  evalscope/utils/filters.py,sha256=x_NX40uWMmUsVrAGHCeeV2e63HZZFugWUgdUhk64ivM,1523
434
445
  evalscope/utils/import_utils.py,sha256=Oo8saX_mMw4U1RrA7_pn8FmV6P9laru4fEgecqqwpqk,2585
435
446
  evalscope/utils/io_utils.py,sha256=Tjdgen1FsAA4ArqiUzu734L0Px5NuiS0GKRRiGIzxSA,4192
436
- evalscope/utils/logger.py,sha256=barHSdtbEu21ynGQj_wS-rd7B02wPPR5AgaWCQzvG4w,3638
447
+ evalscope/utils/logger.py,sha256=Q2IeV_0jxz8L34b5GddPeCKXVh0UClbuhjyLe5Wtj7M,3648
437
448
  evalscope/utils/model_utils.py,sha256=hB9W334ecAb6553FhooT6_jM0g-tjj6AU48IV3K1CKw,1131
438
- evalscope/utils/utils.py,sha256=hP_ntROFsZ-zaNVpJtT2prNo8iX-UAKfRtdxbLtPJng,11105
449
+ evalscope/utils/utils.py,sha256=P5gmpINv5UQrwEMrFZKZjdJspsOdGjaBARfRSDVNOd0,11414
439
450
  tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
440
451
  tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
441
452
  tests/aigc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
442
- tests/aigc/test_t2i.py,sha256=BcdS3OMypWnraXF4Cq3DhDVRpZq0qo9_0Qpyg54B7FY,2627
453
+ tests/aigc/test_t2i.py,sha256=YjEAwlM8cBfGCGOguz86UebJjJ5bsc3jhs4SQqyxwZs,3844
443
454
  tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
444
- tests/cli/test_all.py,sha256=O3lXwOV7A0f0rmltofrjpphnshjNtaZC6NUPG-wsQjg,4082
445
- tests/cli/test_collection.py,sha256=_11mSCWLaiCgheA3uguv6uI3CxqaHUKVwzS6T5BGmxs,4145
446
- tests/cli/test_run.py,sha256=FTFiAb8Ge5raB1aa0Nzw8DPjFLyAlLfXHRQVIWjvvGE,17798
455
+ tests/cli/test_all.py,sha256=noGE54iWnmoPGTsN2PGh7_jM5ceehN6bMnp6xxq4s3A,4240
456
+ tests/cli/test_collection.py,sha256=H7enYWGTmp2VRio-WTEfPRdkf3y-T4fs43Kqf81mbrQ,4181
457
+ tests/cli/test_run.py,sha256=OER_I6FeJAMUA2IN0zKUdUIeRDr8mJFaOiEpwQjYbnE,18166
447
458
  tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
448
459
  tests/perf/test_perf.py,sha256=VbXsqiqgQY3R3bVKizYQmP04UPluUS26MO6YhTzMs48,4848
449
460
  tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
450
461
  tests/rag/test_clip_benchmark.py,sha256=ZCBtgnF8Vuji6WQlb92-_RIvXlUX_Xt-cHZP4AN_DNI,2552
451
- tests/rag/test_mteb.py,sha256=YJw6X1jwX6SYNB-ryVb-OHJWu3vsE3Y4STATI75rdG0,5619
462
+ tests/rag/test_mteb.py,sha256=PaWS5GrZdMO680M129QP2EG000rVq7f2iP3n0YDAv-w,5611
452
463
  tests/rag/test_ragas.py,sha256=E7rfKpKtBqglOL1GcW9adfY8nsOZMuoB8GC55UL1Q3c,4517
453
464
  tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
454
465
  tests/swift/test_run_swift_eval.py,sha256=JKG-0BwTxkbg-XeiXxujPqnVIM3f2EFaJ_9a7p_R4dk,5748
@@ -456,9 +467,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
456
467
  tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
457
468
  tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
458
469
  tests/vlm/test_vlmeval.py,sha256=UqRiBPMU3vRtLIG1Qu4ZVhyUQx-zGYQuLCgobwf-7a4,3176
459
- evalscope-0.16.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
460
- evalscope-0.16.0.dist-info/METADATA,sha256=zX2L_cLxOjX-NNbiR40dmPOxUWyOH86zJycYjr4j5Po,35492
461
- evalscope-0.16.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
462
- evalscope-0.16.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
463
- evalscope-0.16.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
464
- evalscope-0.16.0.dist-info/RECORD,,
470
+ evalscope-0.16.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
471
+ evalscope-0.16.1.dist-info/METADATA,sha256=H8eaMzt6o5k2wFIKnwBdTCPXnAexGvM-0PQqc16iKI4,36244
472
+ evalscope-0.16.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
473
+ evalscope-0.16.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
474
+ evalscope-0.16.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
475
+ evalscope-0.16.1.dist-info/RECORD,,
tests/aigc/test_t2i.py CHANGED
@@ -11,7 +11,7 @@ from evalscope.run import run_task
11
11
  from evalscope.utils import test_level_list
12
12
  from evalscope.utils.logger import get_logger
13
13
 
14
- os.environ['LOG_LEVEL'] = 'DEBUG'
14
+ os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
15
15
 
16
16
  logger = get_logger()
17
17
 
@@ -58,9 +58,9 @@ class TestRun(unittest.TestCase):
58
58
  'torch_dtype': 'torch.float16',
59
59
  },
60
60
  datasets=[
61
- 'tifa160',
61
+ # 'tifa160',
62
62
  # 'genai_bench',
63
- # 'evalmuse',
63
+ 'evalmuse',
64
64
  # 'hpdv2',
65
65
  ],
66
66
  dataset_args={
@@ -85,3 +85,40 @@ class TestRun(unittest.TestCase):
85
85
  )
86
86
 
87
87
  run_task(task_cfg=task_cfg)
88
+
89
+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
90
+ def test_run_benchmark_flux(self):
91
+
92
+ task_cfg = TaskConfig(
93
+ model='black-forest-labs/FLUX.1-dev', # model on modelscope
94
+ model_task=ModelTask.IMAGE_GENERATION, # must be IMAGE_GENERATION
95
+ model_args={
96
+ 'torch_dtype': 'torch.float16',
97
+ },
98
+ datasets=[
99
+ # 'tifa160',
100
+ # 'genai_bench',
101
+ 'evalmuse',
102
+ # 'hpdv2',
103
+ ],
104
+ dataset_args={
105
+ 'tifa160': {
106
+ 'metric_list': [
107
+ 'PickScore',
108
+ # 'CLIPScore',
109
+ # 'HPSv2Score',
110
+ # 'BLIPv2Score',
111
+ # 'ImageRewardScore',
112
+ # 'VQAScore',
113
+ # 'FGA_BLIP2Score',
114
+ ]
115
+ }
116
+ },
117
+ generation_config={
118
+ 'num_inference_steps': 50,
119
+ 'guidance_scale': 3.5
120
+ },
121
+ use_cache='outputs/20250520_112314'
122
+ )
123
+
124
+ run_task(task_cfg=task_cfg)
tests/cli/test_all.py CHANGED
@@ -12,43 +12,46 @@ from evalscope.run import run_task
12
12
  from evalscope.utils import test_level_list
13
13
  from evalscope.utils.logger import get_logger
14
14
 
15
- os.environ['LOG_LEVEL'] = 'DEBUG'
15
+ os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
16
16
 
17
17
  logger = get_logger()
18
18
 
19
19
  datasets=[
20
- 'iquiz',
21
- 'ifeval',
22
- 'mmlu',
23
- 'mmlu_pro',
24
- 'musr',
25
- 'process_bench',
26
- 'race',
27
- 'trivia_qa',
28
- 'cmmlu',
29
- 'humaneval',
30
- 'gsm8k',
31
- 'bbh',
32
- 'competition_math',
33
- 'math_500',
34
- 'aime24',
35
- 'gpqa',
36
- 'arc',
37
- 'ceval',
38
- 'hellaswag',
39
- 'general_mcq',
40
- 'general_qa',
41
- 'super_gpqa',
42
- 'live_code_bench',
43
- 'mmlu_redux',
44
- 'simple_qa',
45
- 'chinese_simpleqa',
46
- 'alpaca_eval',
47
- 'arena_hard',
48
- 'maritime_bench',
49
- 'drop',
50
- 'winogrande',
51
- 'tool_bench',
20
+ # 'iquiz',
21
+ # 'ifeval',
22
+ # 'mmlu',
23
+ # 'mmlu_pro',
24
+ # 'musr',
25
+ # 'process_bench',
26
+ # 'race',
27
+ # 'trivia_qa',
28
+ # 'cmmlu',
29
+ # 'humaneval',
30
+ # 'gsm8k',
31
+ # 'bbh',
32
+ # 'competition_math',
33
+ # 'math_500',
34
+ # 'aime24',
35
+ # 'gpqa',
36
+ # 'arc',
37
+ # 'ceval',
38
+ # 'hellaswag',
39
+ # 'general_mcq',
40
+ # 'general_qa',
41
+ # 'super_gpqa',
42
+ # 'live_code_bench',
43
+ # 'mmlu_redux',
44
+ # 'simple_qa',
45
+ # 'chinese_simpleqa',
46
+ # 'alpaca_eval',
47
+ # 'arena_hard',
48
+ # 'maritime_bench',
49
+ # 'drop',
50
+ # 'winogrande',
51
+ # 'tool_bench',
52
+ 'frames',
53
+ 'docmath',
54
+ 'needle_haystack'
52
55
  ]
53
56
 
54
57
  dataset_args={
@@ -131,7 +134,7 @@ class TestRun(unittest.TestCase):
131
134
  from evalscope.config import TaskConfig
132
135
 
133
136
  task_cfg = TaskConfig(
134
- model='qwen2.5-0.5b-instruct',
137
+ model='qwen-plus',
135
138
  api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
136
139
  api_key= env.get('DASHSCOPE_API_KEY'),
137
140
  eval_type=EvalType.SERVICE,
@@ -145,9 +148,10 @@ class TestRun(unittest.TestCase):
145
148
  'n': 1,
146
149
  'max_tokens': 4096,
147
150
  },
151
+ judge_worker_num=5,
148
152
  judge_strategy=JudgeStrategy.AUTO,
149
153
  judge_model_args={
150
- 'model_id': 'qwen2.5-7b-instruct',
154
+ 'model_id': 'qwen2.5-72b-instruct',
151
155
  'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
152
156
  'api_key': env.get('DASHSCOPE_API_KEY'),
153
157
  }
@@ -72,14 +72,15 @@ class TestCollection(unittest.TestCase):
72
72
  'local_path': 'outputs/mixed_data_test.jsonl'
73
73
  # 'local_path': 'outputs/weighted_mixed_data.jsonl'
74
74
  }},
75
- limit=10,
76
- judge_strategy=JudgeStrategy.LLM_RECALL,
75
+ limit=5,
76
+ judge_strategy=JudgeStrategy.AUTO,
77
77
  judge_model_args={
78
- 'model_id': 'qwen2.5-7b-instruct',
79
- 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
80
- 'api_key': os.getenv('DASHSCOPE_API_KEY'),
78
+ # 'model_id': 'qwen2.5-72b-instruct',
79
+ # 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
80
+ # 'api_key': os.getenv('DASHSCOPE_API_KEY'),
81
81
  },
82
- use_cache='outputs/20250519_114427'
82
+ analysis_report=True,
83
+ # use_cache='outputs/20250522_204520'
83
84
  )
84
85
  res = run_task(task_cfg=task_cfg)
85
86
  print(res)