evalscope 0.15.1__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (46) hide show
  1. evalscope/arguments.py +10 -0
  2. evalscope/backend/rag_eval/utils/llm.py +1 -1
  3. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
  4. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
  5. evalscope/benchmarks/data_adapter.py +4 -2
  6. evalscope/benchmarks/drop/__init__.py +0 -0
  7. evalscope/benchmarks/drop/drop_adapter.py +133 -0
  8. evalscope/benchmarks/drop/utils.py +59 -0
  9. evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -1
  10. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
  11. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  12. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +67 -0
  13. evalscope/benchmarks/tool_bench/utils.py +202 -0
  14. evalscope/benchmarks/utils.py +3 -2
  15. evalscope/benchmarks/winogrande/__init__.py +0 -0
  16. evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
  17. evalscope/collections/evaluator.py +76 -26
  18. evalscope/config.py +46 -15
  19. evalscope/evaluator/evaluator.py +43 -15
  20. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  21. evalscope/metrics/llm_judge.py +3 -3
  22. evalscope/metrics/rouge_metric.py +11 -13
  23. evalscope/models/adapters/chat_adapter.py +51 -34
  24. evalscope/models/adapters/server_adapter.py +15 -19
  25. evalscope/perf/arguments.py +14 -5
  26. evalscope/perf/benchmark.py +0 -6
  27. evalscope/perf/main.py +65 -15
  28. evalscope/perf/utils/benchmark_util.py +33 -15
  29. evalscope/perf/utils/db_util.py +25 -15
  30. evalscope/perf/utils/log_utils.py +1 -1
  31. evalscope/perf/utils/rich_display.py +186 -0
  32. evalscope/report/app.py +47 -34
  33. evalscope/report/utils.py +1 -1
  34. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  35. evalscope/utils/deprecation_utils.py +42 -0
  36. evalscope/version.py +2 -2
  37. {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/METADATA +45 -21
  38. {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/RECORD +46 -36
  39. tests/cli/test_all.py +3 -0
  40. tests/cli/test_collection.py +2 -1
  41. tests/cli/test_run.py +28 -12
  42. tests/perf/test_perf.py +23 -0
  43. {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/LICENSE +0 -0
  44. {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/WHEEL +0 -0
  45. {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/entry_points.txt +0 -0
  46. {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/top_level.txt +0 -0
evalscope/report/app.py CHANGED
@@ -223,6 +223,33 @@ def plot_multi_report_radar(df: pd.DataFrame):
223
223
  return fig
224
224
 
225
225
 
226
+ def convert_markdown_image(text):
227
+ if not os.path.isfile(text):
228
+ return text
229
+ # Convert the image path to a markdown image tag
230
+ if text.endswith('.png') or text.endswith('.jpg') or text.endswith('.jpeg'):
231
+ text = os.path.abspath(text)
232
+ image_tag = f'![image](gradio_api/file={text})'
233
+ logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
234
+ return image_tag
235
+ return text
236
+
237
+
238
+ def convert_html_tags(text):
239
+ # match begin label
240
+ text = re.sub(r'<(\w+)>', r'[\1]', text)
241
+ # match end label
242
+ text = re.sub(r'</(\w+)>', r'[/\1]', text)
243
+ return text
244
+
245
+
246
+ def process_string(string: str, max_length: int = 2048) -> str:
247
+ string = convert_html_tags(string) # for display labels e.g. `<think>`
248
+ if max_length and len(string) > max_length:
249
+ return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
250
+ return string
251
+
252
+
226
253
  def dict_to_markdown(data) -> str:
227
254
  markdown_lines = []
228
255
 
@@ -230,55 +257,41 @@ def dict_to_markdown(data) -> str:
230
257
  bold_key = f'**{key}**'
231
258
 
232
259
  if isinstance(value, list):
233
- value_str = '\n' + '\n'.join([f' - {item}' for item in value])
260
+ value_str = '\n' + '\n'.join([f'- {process_model_prediction(item, max_length=None)}' for item in value])
234
261
  elif isinstance(value, dict):
235
262
  value_str = dict_to_markdown(value)
236
263
  else:
237
264
  value_str = str(value)
238
265
 
239
- value_str = process_string(value_str)
240
- markdown_line = f'{bold_key}: {value_str}'
266
+ value_str = process_string(value_str, max_length=None) # Convert HTML tags but don't truncate
267
+ markdown_line = f'{bold_key}:\n{value_str}'
241
268
  markdown_lines.append(markdown_line)
242
269
 
243
270
  return '\n\n'.join(markdown_lines)
244
271
 
245
272
 
246
- def convert_html_tags(text):
247
- # match begin label
248
- text = re.sub(r'<(\w+)>', r'[\1]', text)
249
- # match end label
250
- text = re.sub(r'</(\w+)>', r'[/\1]', text)
251
- return text
252
-
253
-
254
- def convert_markdown_image(text):
255
- if not os.path.isfile(text):
256
- return text
257
- # Convert the image path to a markdown image tag
258
- if text.endswith('.png') or text.endswith('.jpg') or text.endswith('.jpeg'):
259
- text = os.path.abspath(text)
260
- image_tag = f'![image](gradio_api/file={text})'
261
- logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
262
- return image_tag
263
- return text
273
+ def process_model_prediction(item: Any, max_length: int = 2048) -> str:
274
+ """
275
+ Process model prediction output into a formatted string.
264
276
 
277
+ Args:
278
+ item: The item to process. Can be a string, list, or dictionary.
279
+ max_length: The maximum length of the output string.
265
280
 
266
- def process_string(string: str, max_length: int = 2048) -> str:
267
- string = convert_html_tags(string) # for display labels e.g. `<think>`
268
- if len(string) > max_length:
269
- return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
270
- return string
271
-
272
-
273
- def process_model_prediction(item: Any):
281
+ Returns:
282
+ A formatted string representation of the input.
283
+ """
274
284
  if isinstance(item, dict):
275
- res = dict_to_markdown(item)
276
- return process_string(res)
285
+ result = dict_to_markdown(item)
277
286
  elif isinstance(item, list):
278
- res = '\n'.join([process_model_prediction(item) for item in item])
279
- return process_string(res)
287
+ result = '\n'.join([f'- {process_model_prediction(i, max_length=None)}' for i in item])
280
288
  else:
281
- return process_string(str(item))
289
+ result = str(item)
290
+
291
+ # Apply HTML tag conversion and truncation only at the final output
292
+ if max_length is not None:
293
+ return process_string(result, max_length)
294
+ return result
282
295
 
283
296
 
284
297
  def normalize_score(score):
evalscope/report/utils.py CHANGED
@@ -96,7 +96,7 @@ class Report:
96
96
 
97
97
  @classmethod
98
98
  def from_json(cls, json_file: str):
99
- with open(json_file, 'r') as f:
99
+ with open(json_file, 'r', encoding='utf-8') as f:
100
100
  data = json.load(f)
101
101
  return cls.from_dict(data)
102
102
 
evalscope/third_party/toolbench_static/toolbench_static.py CHANGED
@@ -6,11 +6,12 @@ from typing import Union
6
6
  from evalscope.third_party.toolbench_static.eval import EvalArgs, run_eval
7
7
  from evalscope.third_party.toolbench_static.infer import InferArgs, run_infer
8
8
  from evalscope.utils import get_logger
9
+ from evalscope.utils.deprecation_utils import deprecated
9
10
  from evalscope.utils.io_utils import json_to_dict, yaml_to_dict
10
11
 
11
12
  logger = get_logger()
12
13
 
13
-
14
+ @deprecated(since='0.15.1', remove_in='0.18.0', alternative='Native implementation of ToolBench')
14
15
  def run_task(task_cfg: Union[str, dict]):
15
16
 
16
17
  if isinstance(task_cfg, str):
evalscope/utils/deprecation_utils.py ADDED
@@ -0,0 +1,42 @@
1
+ import functools
2
+ import inspect
3
+ from typing import Callable, Optional
4
+
5
+ from .logger import get_logger
6
+
7
+ logger = get_logger()
8
+
9
+
10
+ def deprecated(since: str, remove_in: Optional[str] = None, alternative: Optional[str] = None) -> Callable:
11
+ """
12
+ Decorator to mark functions as deprecated.
13
+
14
+ :param since: String indicating the version since deprecation
15
+ :param remove_in: Optional string indicating the version when it will be removed
16
+ :param alternative: Optional string suggesting an alternative
17
+ :return: Decorated function
18
+ """
19
+
20
+ def decorator(func: Callable) -> Callable:
21
+
22
+ @functools.wraps(func)
23
+ def wrapper(*args, **kwargs):
24
+ # Get the file name where the function is defined
25
+ file_name = inspect.getfile(func)
26
+
27
+ # Construct the warning message
28
+ warning_parts = [
29
+ f'{func.__name__} in {file_name} has been deprecated since version {since}',
30
+ f'and will be removed in version {remove_in}' if remove_in else None,
31
+ f'Use {alternative} instead' if alternative else None
32
+ ]
33
+ warning_message = '. '.join(filter(None, warning_parts))
34
+
35
+ # Log the warning
36
+ logger.warning(warning_message)
37
+
38
+ return func(*args, **kwargs)
39
+
40
+ return wrapper
41
+
42
+ return decorator
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
1
1
  # Copyright (c) Alibaba, Inc. and its affiliates.
2
2
 
3
- __version__ = '0.15.1'
4
- __release_datetime__ = '2025-04-30 12:00:00'
3
+ __version__ = '0.16.0'
4
+ __release_datetime__ = '2025-05-19 18:00:00'
{evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: evalscope
3
- Version: 0.15.1
3
+ Version: 0.16.0
4
4
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
5
5
  Home-page: https://github.com/modelscope/evalscope
6
6
  Author: ModelScope team
@@ -91,9 +91,10 @@ Requires-Dist: webdataset>0.2.0; extra == "all"
91
91
  Requires-Dist: aiohttp; extra == "all"
92
92
  Requires-Dist: fastapi; extra == "all"
93
93
  Requires-Dist: numpy; extra == "all"
94
+ Requires-Dist: rich; extra == "all"
94
95
  Requires-Dist: sse-starlette; extra == "all"
95
96
  Requires-Dist: transformers; extra == "all"
96
- Requires-Dist: unicorn; extra == "all"
97
+ Requires-Dist: uvicorn; extra == "all"
97
98
  Requires-Dist: gradio==5.4.0; extra == "all"
98
99
  Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
99
100
  Requires-Dist: diffusers; extra == "all"
@@ -110,9 +111,10 @@ Provides-Extra: perf
110
111
  Requires-Dist: aiohttp; extra == "perf"
111
112
  Requires-Dist: fastapi; extra == "perf"
112
113
  Requires-Dist: numpy; extra == "perf"
114
+ Requires-Dist: rich; extra == "perf"
113
115
  Requires-Dist: sse-starlette; extra == "perf"
114
116
  Requires-Dist: transformers; extra == "perf"
115
- Requires-Dist: unicorn; extra == "perf"
117
+ Requires-Dist: uvicorn; extra == "perf"
116
118
  Provides-Extra: rag
117
119
  Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "rag"
118
120
  Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "rag"
@@ -177,9 +179,23 @@ Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
177
179
 
178
180
  ## 📝 Introduction
179
181
 
180
- EvalScope is [ModelScope](https://modelscope.cn/)'s official framework for model evaluation and benchmarking, designed for diverse assessment needs. It supports various model types including large language models, multimodal, embedding, reranker, and CLIP models.
182
+ EvalScope is a comprehensive model evaluation and performance benchmarking framework meticulously crafted by the [ModelScope Community](https://modelscope.cn/), offering a one-stop solution for your model assessment needs. Regardless of the type of model you are developing, EvalScope is equipped to cater to your requirements:
181
183
 
182
- The framework accommodates multiple evaluation scenarios such as end-to-end RAG evaluation, arena mode, and inference performance testing. It features built-in benchmarks and metrics like MMLU, CMMLU, C-Eval, and GSM8K. Seamlessly integrated with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, EvalScope enables one-click evaluations, offering comprehensive support for model training and assessment 🚀
184
+ - 🧠 Large Language Models
185
+ - 🎨 Multimodal Models
186
+ - 🔍 Embedding Models
187
+ - 🏆 Reranker Models
188
+ - 🖼️ CLIP Models
189
+ - 🎭 AIGC Models (Image-to-Text/Video)
190
+ - ...and more!
191
+
192
+ EvalScope is not merely an evaluation tool; it is a valuable ally in your model optimization journey:
193
+
194
+ - 🏅 Equipped with multiple industry-recognized benchmarks and evaluation metrics: MMLU, CMMLU, C-Eval, GSM8K, etc.
195
+ - 📊 Model inference performance stress testing: Ensuring your model excels in real-world applications.
196
+ - 🚀 Seamless integration with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, enabling one-click evaluations and providing full-chain support from training to assessment for your model development.
197
+
198
+ Below is the overall architecture diagram of EvalScope:
183
199
 
184
200
  <p align="center">
185
201
  <img src="docs/en/_static/images/evalscope_framework.png" width="70%">
@@ -214,6 +230,8 @@ Please scan the QR code below to join our community groups:
214
230
 
215
231
  ## 🎉 News
216
232
 
233
+ - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
234
+ - 🔥 **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate model's tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
217
235
  - 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
218
236
  - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
219
237
  - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
@@ -479,26 +497,27 @@ For more customized evaluations, such as customizing model parameters or dataset
479
497
 
480
498
  ```shell
481
499
  evalscope eval \
482
- --model Qwen/Qwen2.5-0.5B-Instruct \
483
- --model-args revision=master,precision=torch.float16,device_map=auto \
484
- --generation-config do_sample=true,temperature=0.5 \
500
+ --model Qwen/Qwen3-0.6B \
501
+ --model-args '{"revision": "master", "precision": "torch.float16", "device_map": "auto"}' \
502
+ --generation-config '{"do_sample":true,"temperature":0.6,"max_new_tokens":512,"chat_template_kwargs":{"enable_thinking": false}}' \
485
503
  --dataset-args '{"gsm8k": {"few_shot_num": 0, "few_shot_random": false}}' \
486
504
  --datasets gsm8k \
487
505
  --limit 10
488
506
  ```
489
507
 
490
- ### Parameter
491
- - `--model-args`: Model loading parameters, separated by commas in `key=value` format. Default parameters:
492
- - `revision`: Model version, default is `master`
493
- - `precision`: Model precision, default is `auto`
494
- - `device_map`: Model device allocation, default is `auto`
495
- - `--generation-config`: Generation parameters, separated by commas in `key=value` format. Default parameters:
496
- - `do_sample`: Whether to use sampling, default is `false`
497
- - `max_length`: Maximum length, default is 2048
498
- - `max_new_tokens`: Maximum length of generation, default is 512
499
- - `--dataset-args`: Configuration parameters for evaluation datasets, passed in `json` format. The key is the dataset name, and the value is the parameters. Note that it needs to correspond one-to-one with the values in the `--datasets` parameter:
508
+ ### Parameter Description
509
+ - `--model-args`: Model loading parameters, passed as a JSON string:
510
+ - `revision`: Model version
511
+ - `precision`: Model precision
512
+ - `device_map`: Device allocation for the model
513
+ - `--generation-config`: Generation parameters, passed as a JSON string and parsed as a dictionary:
514
+ - `do_sample`: Whether to use sampling
515
+ - `temperature`: Generation temperature
516
+ - `max_new_tokens`: Maximum length of generated tokens
517
+ - `chat_template_kwargs`: Model inference template parameters
518
+ - `--dataset-args`: Settings for the evaluation dataset, passed as a JSON string where the key is the dataset name and the value is the parameters. Note that these need to correspond one-to-one with the values in the `--datasets` parameter:
500
519
  - `few_shot_num`: Number of few-shot examples
501
- - `few_shot_random`: Whether to randomly sample few-shot data, if not set, defaults to `true`
520
+ - `few_shot_random`: Whether to randomly sample few-shot data; if not set, defaults to `true`
502
521
 
503
522
  Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)
504
523
 
@@ -517,6 +536,11 @@ A stress testing tool focused on large language models, which can be customized
517
536
 
518
537
  Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html)
519
538
 
539
+ **Output example**
540
+
541
+ ![multi_perf](docs/en/user_guides/stress_test/images/multi_perf.png)
542
+
543
+
520
544
  **Supports wandb for recording results**
521
545
 
522
546
  ![wandb sample](https://modelscope.oss-cn-beijing.aliyuncs.com/resource/wandb_sample.png)
@@ -565,7 +589,7 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
565
589
  </a>
566
590
 
567
591
  ## 🔜 Roadmap
568
- - [ ] Support for better evaluation report visualization
592
+ - [x] Support for better evaluation report visualization
569
593
  - [x] Support for mixed evaluations across multiple datasets
570
594
  - [x] RAG evaluation
571
595
  - [x] VLM evaluation
@@ -575,7 +599,7 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
575
599
  - [x] Multi-modal evaluation
576
600
  - [ ] Benchmarks
577
601
  - [ ] GAIA
578
- - [ ] GPQA
602
+ - [x] GPQA
579
603
  - [x] MBPP
580
604
 
581
605
 
{evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/RECORD CHANGED
@@ -1,11 +1,11 @@
1
1
  evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
2
- evalscope/arguments.py,sha256=jywTxu_HWhgf0_OlnaOyRSzUHenr5Zio2vmcCgcfbxg,5453
3
- evalscope/config.py,sha256=O3kjjVFRGSrlLD5EI4t99Z-m6oFtQVmEudvE62x92wY,9648
2
+ evalscope/arguments.py,sha256=fZW-om5E2_JaFcEmkvahvundjedPLgIDde-zwDXinG0,5868
3
+ evalscope/config.py,sha256=19QaZ5VS8wknt4sLBxiZkR6pH-nm4Ph3Kl-1bZgcQcE,10799
4
4
  evalscope/constants.py,sha256=PHnsGndB4N5-jvmawPxMK5b9geE2Es5cUe8ZKYSuKgM,4016
5
5
  evalscope/run.py,sha256=_DKbxgQGwhweBnQrI7lQhu5eoz4LYPVeNanzD4lHuJA,6476
6
6
  evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
7
7
  evalscope/summarizer.py,sha256=61kU5ZoSh1dd8HMJPqP3ZvJwcY9szwWFCZdu2lfATJA,5920
8
- evalscope/version.py,sha256=eFCP5Hfk4dip59uCASefVxaNqxWNtwDQPrqaoRJxO9c,119
8
+ evalscope/version.py,sha256=8STVV6Y877B3esrgvovInSk4IFNzxZ_ZEz9ND_6B2lQ,119
9
9
  evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
11
11
  evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -50,14 +50,14 @@ evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=6x-4O2pgsjZCVf
50
50
  evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
51
51
  evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
52
52
  evalscope/backend/rag_eval/utils/embedding.py,sha256=tFMepPAMO4Kkqeqh-XxXIDYRjGbCMlk7lwuUW7FNvCA,7977
53
- evalscope/backend/rag_eval/utils/llm.py,sha256=acaD5QHPJUstJGpW1sNJ-3ZPT5J_Z8beOWb61Rtz07U,2607
53
+ evalscope/backend/rag_eval/utils/llm.py,sha256=NHjm0SeQVsSIG8uISXZcQypku4QRc3KtteeO9ldv0FI,2611
54
54
  evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
55
55
  evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
56
56
  evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=sUYvQxCtPl6CrcwhQpY8lJjW5skqWc-fvHUSnXd_MvQ,6054
57
57
  evalscope/benchmarks/__init__.py,sha256=5AXNhhmbaBFEe3u7y5TtIrviYzFI-hC8oKqxFILs1pE,937
58
58
  evalscope/benchmarks/benchmark.py,sha256=a_7Ctz36McuTyBSTYi56jis9pvOdWhg7JVSPFrbxqR4,2535
59
- evalscope/benchmarks/data_adapter.py,sha256=mWdxtHbordS577NqZUQZmIjlewjGDlStqc-iDvqpAyU,18061
60
- evalscope/benchmarks/utils.py,sha256=yXQyszzrILNiBuUrbB1BtgotQSaNA8w6X935AL1dNAw,1074
59
+ evalscope/benchmarks/data_adapter.py,sha256=lcBoXhI1Byn0HcwbVxmIeUFxZlz_wiqte6RDPOR8sbM,18184
60
+ evalscope/benchmarks/utils.py,sha256=jB9w3mN1eOur6j2kpQB_XZJ912fhzC0GaSeHOoylK7M,1087
61
61
  evalscope/benchmarks/aigc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
62
62
  evalscope/benchmarks/aigc/t2i/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
63
63
  evalscope/benchmarks/aigc/t2i/base.py,sha256=4GFAvceT1Gpt5teDLRCZi62RwvPazuhG3zwft3gN3X4,2102
@@ -70,7 +70,7 @@ evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
70
70
  evalscope/benchmarks/aime/aime24_adapter.py,sha256=GrIxCHpUwgUy8tXGTB7iQOt8k7wG8MJB0CWbwBmIy-8,1703
71
71
  evalscope/benchmarks/aime/aime25_adapter.py,sha256=yxo5roCb8ryX9ROUU2FdZ-WBTUPZ14MrBzEL0zPOh-U,1718
72
72
  evalscope/benchmarks/alpaca_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
73
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=em1YM2PxnJ8Of7Li3eqrw8PtwfeXSinfVIr-CIKVb60,4026
73
+ evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=oUHpWrt5Gx0jF80RBd7zTh_1AWI66YvDd6U1vOMoqj0,3828
74
74
  evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
75
75
  evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
76
76
  evalscope/benchmarks/arc/arc_adapter.py,sha256=0h-eT4BBmUJQrakKMPUNE1nSRwK6LHB-cflWpWzY978,6364
@@ -110,7 +110,7 @@ evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTP
110
110
  evalscope/benchmarks/ceval/ceval_adapter.py,sha256=1ITBXI0f01Dt1p7sb2RGswIeg9685Bkk2S2xmA1vat8,11295
111
111
  evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
112
112
  evalscope/benchmarks/chinese_simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
113
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=zY8dfvrTeCHAQ3d7AM02CexZw5CVKH51ZOhtT7Q1Gko,8031
113
+ evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=Q6ncuLrCUrrhhljIfMsgWnyhHfcWWwh8iA6NZvz3W28,8079
114
114
  evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
115
115
  evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
116
116
  evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=r9zael_Y2Jso0ashevYpF8e5SHOBh8iMcPIJU5WT3pQ,10367
@@ -120,10 +120,13 @@ evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc
120
120
  evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=wgejW-_QswtT8_3JKAQ_H6svH8IotDJDBEH7X4nP4bY,6760
121
121
  evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
122
122
  evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=QgLgIrjD3q53T-lu1UWTV6T4h1cKGoCQDh0O4QxFezw,2569
123
+ evalscope/benchmarks/drop/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
124
+ evalscope/benchmarks/drop/drop_adapter.py,sha256=V-Vx6g2_1kcDUDWOKVX1vPSLt5iHn8NQkpWbsIwPaa4,8325
125
+ evalscope/benchmarks/drop/utils.py,sha256=Z9PHrNnRfGqFHCLONg5SWKARp1eTJlHFc_bU46t_YrM,1344
123
126
  evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
124
127
  evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=fqbt61owPP7t2H4B2zbYVZTs0VBGuXNvWGvkukwhRYc,5039
125
128
  evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
126
- evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=I2BanmO4WLrKviyLiIeqmS5mdyjqGg1X7hauv4HBjgk,4653
129
+ evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=40mZovspVf-OXcuEu3ei6G_HZlYA8whAHSESHPPONxA,4750
127
130
  evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
128
131
  evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
129
132
  evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=UB287DtnbkSQXZsbReFJqmQRwbo672DTCeXXilR_-Vc,4790
@@ -174,12 +177,15 @@ evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO
174
177
  evalscope/benchmarks/race/race_adapter.py,sha256=RD0B-i5dzeNKuhqnWbremgf4tk9jmOO4_eLAiITB1F0,6381
175
178
  evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
176
179
  evalscope/benchmarks/simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
177
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=to4kSKc29BmtG4q9R2PeM-sdHiL8toSyoVi1D9WMRKk,8949
180
+ evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=TD7hkMLGZ4GK7wD7cwqJ3jCcTAaixOakUy3o5DaPYHI,8997
178
181
  evalscope/benchmarks/super_gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
179
182
  evalscope/benchmarks/super_gpqa/five_shot_prompt.txt,sha256=vD3RMeQustxY_oWA8IobntjywT8ZUO7Jaub--rElDT4,4718
180
183
  evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=BqNLL8BYnK6tRuIdV6ijL4Uym2SejH_h1BV06XNjSE4,9331
181
184
  evalscope/benchmarks/super_gpqa/utils.py,sha256=ftYPP9ODvLBlQSd9ltACx9iRIvjB8u1bg4AtgcJ4JAI,3360
182
185
  evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt,sha256=y7hR9SmoR_YqoEWtT8N9JpZOpeJIlg0cDGDgYw6R6hM,237
186
+ evalscope/benchmarks/tool_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
187
+ evalscope/benchmarks/tool_bench/tool_bench_adapter.py,sha256=l2dBcJ4Z3m-8QFtfyFH4IqMtvkY3Rfk021P9Ff_lXWQ,2270
188
+ evalscope/benchmarks/tool_bench/utils.py,sha256=vIPsL8FmMF2JZRHCZeLS_dDeATKNRvZDbq6T-Znlk8Q,7025
183
189
  evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
184
190
  evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
185
191
  evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
@@ -187,6 +193,8 @@ evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=7tMc8vVZdBnks5jWrBSrb
187
193
  evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
188
194
  evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
189
195
  evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=ueUU860kg5_xf_MtUCa6ck-fGHX3ttw8Xh3mWSJyOZA,12617
196
+ evalscope/benchmarks/winogrande/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
197
+ evalscope/benchmarks/winogrande/winogrande_adapter.py,sha256=UdANz3YmCtV2YfGuEihTe3vpUTlIxeXBhIqGkKbTFdU,1956
190
198
  evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
191
199
  evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
192
200
  evalscope/cli/cli.py,sha256=w_dtXljur9s5lmTn6LbbFL_viTPQB1WAEzhYcId09Og,729
@@ -195,22 +203,22 @@ evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,7
195
203
  evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
196
204
  evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
197
205
  evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
198
- evalscope/collections/evaluator.py,sha256=Ll-qLet04aEp1WxoCKAuvZVWEZuy1lS_D-vZIN3zSQQ,13425
206
+ evalscope/collections/evaluator.py,sha256=3sz_bL0HMFkxq3C-4P6rNGrnQolifVISI5sEpT3Bt90,15754
199
207
  evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
200
208
  evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
201
209
  evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
202
- evalscope/evaluator/evaluator.py,sha256=oOVYRMMQfT3fqu-l33wmJtKlyeWxwoIUADMCoBNARTM,20271
210
+ evalscope/evaluator/evaluator.py,sha256=QzTFXiv_WdPpWTB3PgBNIz9KS_Rxu-fWDvoUpML23aA,21651
203
211
  evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
204
212
  evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
205
213
  evalscope/evaluator/reviewer/auto_reviewer.py,sha256=5WRYuXFTDgVmolrOdiTysk-mXrpw6Qg87-iuY-VD1W4,16618
206
214
  evalscope/metrics/__init__.py,sha256=y1sdj5FBKYW1q5kLC6QREzoITHwstJRUdji6p0X5aAE,1363
207
- evalscope/metrics/llm_judge.py,sha256=MjyTC-xiSThk8Rd4IdUbsCXeeikoOORv6wt8H7SW8s4,4008
215
+ evalscope/metrics/llm_judge.py,sha256=qYHsoBz-zXjL57Czl9CaPcyJT5SZr05giv5Q9SFK3cY,4000
208
216
  evalscope/metrics/math_parser.py,sha256=uTDudn305G3b8-GboWTrDE6OfrEwAW-areHnoGXZ6Is,17302
209
217
  evalscope/metrics/metrics.py,sha256=_YI7RhxlFu_JOgeE3LF9UKu6mJruvyu4FgqVf78Bjb8,13813
210
218
  evalscope/metrics/named_metrics.py,sha256=PrzU_1mGTeRFxVJFT1aXxIOiS7MnNoWyZsb8uCRVDeE,2278
211
- evalscope/metrics/rouge_metric.py,sha256=zhIUqenSuxnORR9tamLQBGjFwP91Zei2UiLtcOyseVM,4639
219
+ evalscope/metrics/rouge_metric.py,sha256=bqvSotuDdC0MEKmt8v6y6tBTBx0S3Ma-tfF-cMCckA4,4645
212
220
  evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
213
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=Kq6AObenmLVQ5tN3NgN042a6mgRFQmRO21-ohd9mSa8,11972
221
+ evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=m7v8ZysO9zCuyThEoGTe5QNVt2GsKMgZpH6du1FQCvg,12110
214
222
  evalscope/metrics/t2v_metrics/__init__.py,sha256=GBxgKTPVy_qhW_F3M4Oi6QMWhdAi4PqGX5w3t6Tueho,1783
215
223
  evalscope/metrics/t2v_metrics/clipscore.py,sha256=IsrYKIlFb04-FfBq4MbSv4diS6706J15Y3G4qEFIwfU,455
216
224
  evalscope/metrics/t2v_metrics/constants.py,sha256=oY5l5fOFl8qylah9eeebZm0pgY1PYmHDa7JlUC8Qls0,451
@@ -318,19 +326,19 @@ evalscope/models/model.py,sha256=MxvJAUNkuT7IA3bchnmJDur_YCKj9ShOD2Uq40dBcGc,630
318
326
  evalscope/models/register.py,sha256=pNC69YUvw-lodYpOXmByHm26h4m0Lofgd_om-JhOBq4,1882
319
327
  evalscope/models/adapters/__init__.py,sha256=mduiDZ6LgmkefNf4CtObZk6heOB93HxxgqTuYvrqWoo,590
320
328
  evalscope/models/adapters/base_adapter.py,sha256=f2FY8DLERudkfb4_anxNVFE_D19xCJj9BObiHWspewI,3268
321
- evalscope/models/adapters/chat_adapter.py,sha256=hzFrpvIrakKO5hsnbdXiDTO0cGajAdhcAN9ENoI6XvY,7312
329
+ evalscope/models/adapters/chat_adapter.py,sha256=PAClyBL_nQ1I1kmjeeZ3sdC-y5ZmfFj8rjCigh_vr40,7885
322
330
  evalscope/models/adapters/choice_adapter.py,sha256=4fuz3MFEqK8ln4mMs3goMCdRPBwYmmgN70HTdr_sW_U,8005
323
331
  evalscope/models/adapters/custom_adapter.py,sha256=w8cD0b3xgcdhSZelcat67CGJnALOfz5IALzURnLjab8,2275
324
- evalscope/models/adapters/server_adapter.py,sha256=5kH1yDAjETogR7aOdnCEueYE1bREI40OdXdBiJpMdIM,6734
332
+ evalscope/models/adapters/server_adapter.py,sha256=d-0ne7ymWXmvKf_ypJ0093RNwplZJwhvU2xRwc8rt70,6581
325
333
  evalscope/models/adapters/t2i_adapter.py,sha256=xkMRyZ61yTiJfmULK-p9du4nNox41pkHiV2CTFBO3qM,2659
326
334
  evalscope/models/custom/__init__.py,sha256=MZylegALg1HerOYtp-qbzu4Wb6PW3JbrxwONHU-PAVs,131
327
335
  evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
328
336
  evalscope/models/custom/dummy_model.py,sha256=WRT_aCBZLXnC4yRCgggkuySkhM71C47O2Txx_YNc3UM,1933
329
337
  evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
330
- evalscope/perf/arguments.py,sha256=UZKlkbDE2N408dY8Ji-WB8sl1rcmamywzxLvNXpnY0w,10194
331
- evalscope/perf/benchmark.py,sha256=C0tLaZzxqMonZK4iLtfjiQIxX3tO3-uFrOjgV-oVsU0,8024
338
+ evalscope/perf/arguments.py,sha256=5dTtaBR9BIobaKkX1Xj-mphHDG4uugnGaVOvWpLfN04,10714
339
+ evalscope/perf/benchmark.py,sha256=eGnxMLQXSYBGRJS4tS8geSJAirnuWo35M4orlRZzei8,7847
332
340
  evalscope/perf/http_client.py,sha256=-c3-N7bxKsj3d5DVsKSaYA3XAHJDzZgoqZBbhuDYIGk,7419
333
- evalscope/perf/main.py,sha256=C7iNEdb4SEMGmHsF4DHAak4O1zRxrWW1tMRmyhEkVwQ,1376
341
+ evalscope/perf/main.py,sha256=clHzkQNmv7wv-OWkuNGDQ-8YoLUCWxARIX-Eisinpms,3096
334
342
  evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
335
343
  evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
336
344
  evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
@@ -349,11 +357,12 @@ evalscope/perf/plugin/datasets/random_dataset.py,sha256=SIlsjAE_Stknfr6o1CBFvANB
349
357
  evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=J6q7AF_Re5eHLVejXEw9c1jlk1T1PPmist0yO9UFTPE,2432
350
358
  evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
351
359
  evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
352
- evalscope/perf/utils/benchmark_util.py,sha256=CftjnxYA7d1aeAL_iuyXcJPwCL5A8zWGZSkNtjrMyW8,6309
353
- evalscope/perf/utils/db_util.py,sha256=VsYgz6IsSNPAWGCopOOIxAUhUat3GRbZMlrfdZ6i4kM,9575
360
+ evalscope/perf/utils/benchmark_util.py,sha256=PcRTeKlEIslBw0zKVS2mFg6GgJ6J8m1f2-gAaEBeiHI,7236
361
+ evalscope/perf/utils/db_util.py,sha256=xqrXZapP_WwUdzkgFBTh3LDBWzr_UoU8v13rOjQ8TT4,9876
354
362
  evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
355
363
  evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627
356
- evalscope/perf/utils/log_utils.py,sha256=1jmB31W3ol9ukPAPbQ8xG3yoZ9oi3tjEyMK5M3ERmbw,1471
364
+ evalscope/perf/utils/log_utils.py,sha256=Xm5A8g8BaozaI_0TaPzr2aAxUBCCf-w7II-FcifrIYg,1503
365
+ evalscope/perf/utils/rich_display.py,sha256=SavP2L44UwN58ZUGR2W1wxM4h4F1iyPa90HhT-Ypkzs,8125
357
366
  evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
358
367
  evalscope/registry/config/cfg_arena.yaml,sha256=rub6ceaQxxB1mbSjdoFf0IaVgGfbOonV2nYRebv2OKo,3292
359
368
  evalscope/registry/config/cfg_arena_zhihu.yaml,sha256=tvvihBwvoTjoezwTSaZwoGOB44ysofpnin4pNyY9TfQ,2755
@@ -376,11 +385,11 @@ evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHM
376
385
  evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
377
386
  evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
378
387
  evalscope/report/__init__.py,sha256=iLNqx7CnHSHQmOBqWUK_vt2VIjnvGslJTqn--7B4y_s,316
379
- evalscope/report/app.py,sha256=8pcQi5oYAYa9hXoMoMUNfy9jSvSR9DDiXyLcyPd9AmA,28459
388
+ evalscope/report/app.py,sha256=FxNpiEmbpH_B7D5SYN42idGsyOgkgFrLzScOVrwL3SI,28998
380
389
  evalscope/report/app_arguments.py,sha256=1wHTLeFx1G94cKXYOeOVe_wTiOY2D929UctIRGOtRaQ,699
381
390
  evalscope/report/combinator.py,sha256=O3QirwtYhDhdaWVT4STJMCGZMwoX8BTeJ3HtS9iwnWQ,2567
382
391
  evalscope/report/generator.py,sha256=q9aHWNjQgvutAKtpjfWOpfu5zNFdnXilO9OqBqt_Phg,3612
383
- evalscope/report/utils.py,sha256=DRlbjbqHEmM8rGlA4pwtlHFhOZtyUzcqiS-mejfIDkU,4584
392
+ evalscope/report/utils.py,sha256=uu-rAzoN6ZIlv52IDWSZCcmNVY3DscNo2f9H9-gjZHY,4602
384
393
  evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
385
394
  evalscope/third_party/longbench_write/README.md,sha256=1yLKeSVIcihpoc4KXr8NpK86JbcLssCPx76aOKdPbYI,5431
386
395
  evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
@@ -413,13 +422,14 @@ evalscope/third_party/toolbench_static/config_default.yaml,sha256=-6n6Zyg9eHN2ee
413
422
  evalscope/third_party/toolbench_static/eval.py,sha256=do_-lVi_vEoljeLYvt3b_AYSMqpdKzgYnTek9WLSKe8,8236
414
423
  evalscope/third_party/toolbench_static/infer.py,sha256=rsADLhEd2IBcC6EI9aD7hSJmo6Oo5b22mnHWBCZLDPs,9010
415
424
  evalscope/third_party/toolbench_static/requirements.txt,sha256=OW91Z8hfzh7yQUYgP1Di_E6DgNgGoGP1UcvnqrdCR68,22
416
- evalscope/third_party/toolbench_static/toolbench_static.py,sha256=ABb9Gy09zMt30tY50AZGxSZ46k3NVEsvuDj6xlLOjeA,1966
425
+ evalscope/third_party/toolbench_static/toolbench_static.py,sha256=xE__eXvSwHmmSh1tXNvyBo6MCO4mDlYTbIYl9OGEfNI,2120
417
426
  evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
418
427
  evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=GITEbyiER10Zi-ZWpSqYCdAsiVtNeGK24hvR3kmYn2s,2689
419
428
  evalscope/utils/__init__.py,sha256=jLVoGryuqUh4Km9QWWQBzpqkcVNRK0MbwNaSgckqdiU,139
420
429
  evalscope/utils/arena_utils.py,sha256=Gf8VpH4C_oF2Abif_QeL0rAP6tvTzsc0gglpdNkUE48,7155
421
430
  evalscope/utils/chat_service.py,sha256=U2jtrkOa2asRp16Zam0zIi_38mCyWQqql_L6JSwii4I,8749
422
431
  evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
432
+ evalscope/utils/deprecation_utils.py,sha256=WyeiLWSi5ti6FkuMbhimcPPUB43paa1FZ5-JOAWNFZI,1344
423
433
  evalscope/utils/filters.py,sha256=x_NX40uWMmUsVrAGHCeeV2e63HZZFugWUgdUhk64ivM,1523
424
434
  evalscope/utils/import_utils.py,sha256=Oo8saX_mMw4U1RrA7_pn8FmV6P9laru4fEgecqqwpqk,2585
425
435
  evalscope/utils/io_utils.py,sha256=Tjdgen1FsAA4ArqiUzu734L0Px5NuiS0GKRRiGIzxSA,4192
@@ -431,11 +441,11 @@ tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
431
441
  tests/aigc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
432
442
  tests/aigc/test_t2i.py,sha256=BcdS3OMypWnraXF4Cq3DhDVRpZq0qo9_0Qpyg54B7FY,2627
433
443
  tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
434
- tests/cli/test_all.py,sha256=pwup--iNxckUEsR_aFjIAbEQo3UogSu5aIWf9ryLP2o,4022
435
- tests/cli/test_collection.py,sha256=y8FjoPziPRf5BdJK8DHjcXn26ETKz1OyqjnCpwjt-F4,4096
436
- tests/cli/test_run.py,sha256=1DHLFlgGvHJizbLVc1ShcGFAHirEPgW8r88H7g8Sbx4,17245
444
+ tests/cli/test_all.py,sha256=O3lXwOV7A0f0rmltofrjpphnshjNtaZC6NUPG-wsQjg,4082
445
+ tests/cli/test_collection.py,sha256=_11mSCWLaiCgheA3uguv6uI3CxqaHUKVwzS6T5BGmxs,4145
446
+ tests/cli/test_run.py,sha256=FTFiAb8Ge5raB1aa0Nzw8DPjFLyAlLfXHRQVIWjvvGE,17798
437
447
  tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
438
- tests/perf/test_perf.py,sha256=diwwEmoWR-6xSVeGF65J6TWHRNj54rkwyvnhHh7PiE0,3919
448
+ tests/perf/test_perf.py,sha256=VbXsqiqgQY3R3bVKizYQmP04UPluUS26MO6YhTzMs48,4848
439
449
  tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
440
450
  tests/rag/test_clip_benchmark.py,sha256=ZCBtgnF8Vuji6WQlb92-_RIvXlUX_Xt-cHZP4AN_DNI,2552
441
451
  tests/rag/test_mteb.py,sha256=YJw6X1jwX6SYNB-ryVb-OHJWu3vsE3Y4STATI75rdG0,5619
@@ -446,9 +456,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
446
456
  tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
447
457
  tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
448
458
  tests/vlm/test_vlmeval.py,sha256=UqRiBPMU3vRtLIG1Qu4ZVhyUQx-zGYQuLCgobwf-7a4,3176
449
- evalscope-0.15.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
450
- evalscope-0.15.1.dist-info/METADATA,sha256=JvRF5sI_9ak9Y-FwWdU1Y8BE96iKPLO_hIGC7Z9SWpg,34080
451
- evalscope-0.15.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
452
- evalscope-0.15.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
453
- evalscope-0.15.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
454
- evalscope-0.15.1.dist-info/RECORD,,
459
+ evalscope-0.16.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
460
+ evalscope-0.16.0.dist-info/METADATA,sha256=zX2L_cLxOjX-NNbiR40dmPOxUWyOH86zJycYjr4j5Po,35492
461
+ evalscope-0.16.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
462
+ evalscope-0.16.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
463
+ evalscope-0.16.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
464
+ evalscope-0.16.0.dist-info/RECORD,,
tests/cli/test_all.py CHANGED
@@ -46,6 +46,9 @@ datasets=[
46
46
  'alpaca_eval',
47
47
  'arena_hard',
48
48
  'maritime_bench',
49
+ 'drop',
50
+ 'winogrande',
51
+ 'tool_bench',
49
52
  ]
50
53
 
51
54
  dataset_args={
@@ -78,7 +78,8 @@ class TestCollection(unittest.TestCase):
78
78
  'model_id': 'qwen2.5-7b-instruct',
79
79
  'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
80
80
  'api_key': os.getenv('DASHSCOPE_API_KEY'),
81
- }
81
+ },
82
+ use_cache='outputs/20250519_114427'
82
83
  )
83
84
  res = run_task(task_cfg=task_cfg)
84
85
  print(res)