evalscope 0.15.1__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/arguments.py +10 -0
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
- evalscope/benchmarks/data_adapter.py +4 -2
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +133 -0
- evalscope/benchmarks/drop/utils.py +59 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -1
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +67 -0
- evalscope/benchmarks/tool_bench/utils.py +202 -0
- evalscope/benchmarks/utils.py +3 -2
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
- evalscope/collections/evaluator.py +76 -26
- evalscope/config.py +46 -15
- evalscope/evaluator/evaluator.py +43 -15
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +3 -3
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/models/adapters/chat_adapter.py +51 -34
- evalscope/models/adapters/server_adapter.py +15 -19
- evalscope/perf/arguments.py +14 -5
- evalscope/perf/benchmark.py +0 -6
- evalscope/perf/main.py +65 -15
- evalscope/perf/utils/benchmark_util.py +33 -15
- evalscope/perf/utils/db_util.py +25 -15
- evalscope/perf/utils/log_utils.py +1 -1
- evalscope/perf/utils/rich_display.py +186 -0
- evalscope/report/app.py +47 -34
- evalscope/report/utils.py +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/deprecation_utils.py +42 -0
- evalscope/version.py +2 -2
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/METADATA +45 -21
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/RECORD +46 -36
- tests/cli/test_all.py +3 -0
- tests/cli/test_collection.py +2 -1
- tests/cli/test_run.py +28 -12
- tests/perf/test_perf.py +23 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/LICENSE +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/WHEEL +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/top_level.txt +0 -0
evalscope/report/app.py
CHANGED
|
@@ -223,6 +223,33 @@ def plot_multi_report_radar(df: pd.DataFrame):
|
|
|
223
223
|
return fig
|
|
224
224
|
|
|
225
225
|
|
|
226
|
+
def convert_markdown_image(text):
|
|
227
|
+
if not os.path.isfile(text):
|
|
228
|
+
return text
|
|
229
|
+
# Convert the image path to a markdown image tag
|
|
230
|
+
if text.endswith('.png') or text.endswith('.jpg') or text.endswith('.jpeg'):
|
|
231
|
+
text = os.path.abspath(text)
|
|
232
|
+
image_tag = f''
|
|
233
|
+
logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
|
|
234
|
+
return image_tag
|
|
235
|
+
return text
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def convert_html_tags(text):
|
|
239
|
+
# match begin label
|
|
240
|
+
text = re.sub(r'<(\w+)>', r'[\1]', text)
|
|
241
|
+
# match end label
|
|
242
|
+
text = re.sub(r'</(\w+)>', r'[/\1]', text)
|
|
243
|
+
return text
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def process_string(string: str, max_length: int = 2048) -> str:
|
|
247
|
+
string = convert_html_tags(string) # for display labels e.g.
|
|
248
|
+
if max_length and len(string) > max_length:
|
|
249
|
+
return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
|
|
250
|
+
return string
|
|
251
|
+
|
|
252
|
+
|
|
226
253
|
def dict_to_markdown(data) -> str:
|
|
227
254
|
markdown_lines = []
|
|
228
255
|
|
|
@@ -230,55 +257,41 @@ def dict_to_markdown(data) -> str:
|
|
|
230
257
|
bold_key = f'**{key}**'
|
|
231
258
|
|
|
232
259
|
if isinstance(value, list):
|
|
233
|
-
value_str = '\n' + '\n'.join([f'
|
|
260
|
+
value_str = '\n' + '\n'.join([f'- {process_model_prediction(item, max_length=None)}' for item in value])
|
|
234
261
|
elif isinstance(value, dict):
|
|
235
262
|
value_str = dict_to_markdown(value)
|
|
236
263
|
else:
|
|
237
264
|
value_str = str(value)
|
|
238
265
|
|
|
239
|
-
value_str = process_string(value_str)
|
|
240
|
-
markdown_line = f'{bold_key}
|
|
266
|
+
value_str = process_string(value_str, max_length=None) # Convert HTML tags but don't truncate
|
|
267
|
+
markdown_line = f'{bold_key}:\n{value_str}'
|
|
241
268
|
markdown_lines.append(markdown_line)
|
|
242
269
|
|
|
243
270
|
return '\n\n'.join(markdown_lines)
|
|
244
271
|
|
|
245
272
|
|
|
246
|
-
def
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
# match end label
|
|
250
|
-
text = re.sub(r'</(\w+)>', r'[/\1]', text)
|
|
251
|
-
return text
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
def convert_markdown_image(text):
|
|
255
|
-
if not os.path.isfile(text):
|
|
256
|
-
return text
|
|
257
|
-
# Convert the image path to a markdown image tag
|
|
258
|
-
if text.endswith('.png') or text.endswith('.jpg') or text.endswith('.jpeg'):
|
|
259
|
-
text = os.path.abspath(text)
|
|
260
|
-
image_tag = f''
|
|
261
|
-
logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
|
|
262
|
-
return image_tag
|
|
263
|
-
return text
|
|
273
|
+
def process_model_prediction(item: Any, max_length: int = 2048) -> str:
|
|
274
|
+
"""
|
|
275
|
+
Process model prediction output into a formatted string.
|
|
264
276
|
|
|
277
|
+
Args:
|
|
278
|
+
item: The item to process. Can be a string, list, or dictionary.
|
|
279
|
+
max_length: The maximum length of the output string.
|
|
265
280
|
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
|
|
270
|
-
return string
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
def process_model_prediction(item: Any):
|
|
281
|
+
Returns:
|
|
282
|
+
A formatted string representation of the input.
|
|
283
|
+
"""
|
|
274
284
|
if isinstance(item, dict):
|
|
275
|
-
|
|
276
|
-
return process_string(res)
|
|
285
|
+
result = dict_to_markdown(item)
|
|
277
286
|
elif isinstance(item, list):
|
|
278
|
-
|
|
279
|
-
return process_string(res)
|
|
287
|
+
result = '\n'.join([f'- {process_model_prediction(i, max_length=None)}' for i in item])
|
|
280
288
|
else:
|
|
281
|
-
|
|
289
|
+
result = str(item)
|
|
290
|
+
|
|
291
|
+
# Apply HTML tag conversion and truncation only at the final output
|
|
292
|
+
if max_length is not None:
|
|
293
|
+
return process_string(result, max_length)
|
|
294
|
+
return result
|
|
282
295
|
|
|
283
296
|
|
|
284
297
|
def normalize_score(score):
|
evalscope/report/utils.py
CHANGED
|
@@ -6,11 +6,12 @@ from typing import Union
|
|
|
6
6
|
from evalscope.third_party.toolbench_static.eval import EvalArgs, run_eval
|
|
7
7
|
from evalscope.third_party.toolbench_static.infer import InferArgs, run_infer
|
|
8
8
|
from evalscope.utils import get_logger
|
|
9
|
+
from evalscope.utils.deprecation_utils import deprecated
|
|
9
10
|
from evalscope.utils.io_utils import json_to_dict, yaml_to_dict
|
|
10
11
|
|
|
11
12
|
logger = get_logger()
|
|
12
13
|
|
|
13
|
-
|
|
14
|
+
@deprecated(since='0.15.1', remove_in='0.18.0', alternative='Native implementation of ToolBench')
|
|
14
15
|
def run_task(task_cfg: Union[str, dict]):
|
|
15
16
|
|
|
16
17
|
if isinstance(task_cfg, str):
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
import inspect
|
|
3
|
+
from typing import Callable, Optional
|
|
4
|
+
|
|
5
|
+
from .logger import get_logger
|
|
6
|
+
|
|
7
|
+
logger = get_logger()
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def deprecated(since: str, remove_in: Optional[str] = None, alternative: Optional[str] = None) -> Callable:
|
|
11
|
+
"""
|
|
12
|
+
Decorator to mark functions as deprecated.
|
|
13
|
+
|
|
14
|
+
:param since: String indicating the version since deprecation
|
|
15
|
+
:param remove_in: Optional string indicating the version when it will be removed
|
|
16
|
+
:param alternative: Optional string suggesting an alternative
|
|
17
|
+
:return: Decorated function
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
def decorator(func: Callable) -> Callable:
|
|
21
|
+
|
|
22
|
+
@functools.wraps(func)
|
|
23
|
+
def wrapper(*args, **kwargs):
|
|
24
|
+
# Get the file name where the function is defined
|
|
25
|
+
file_name = inspect.getfile(func)
|
|
26
|
+
|
|
27
|
+
# Construct the warning message
|
|
28
|
+
warning_parts = [
|
|
29
|
+
f'{func.__name__} in {file_name} has been deprecated since version {since}',
|
|
30
|
+
f'and will be removed in version {remove_in}' if remove_in else None,
|
|
31
|
+
f'Use {alternative} instead' if alternative else None
|
|
32
|
+
]
|
|
33
|
+
warning_message = '. '.join(filter(None, warning_parts))
|
|
34
|
+
|
|
35
|
+
# Log the warning
|
|
36
|
+
logger.warning(warning_message)
|
|
37
|
+
|
|
38
|
+
return func(*args, **kwargs)
|
|
39
|
+
|
|
40
|
+
return wrapper
|
|
41
|
+
|
|
42
|
+
return decorator
|
evalscope/version.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: evalscope
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.16.0
|
|
4
4
|
Summary: EvalScope: Lightweight LLMs Evaluation Framework
|
|
5
5
|
Home-page: https://github.com/modelscope/evalscope
|
|
6
6
|
Author: ModelScope team
|
|
@@ -91,9 +91,10 @@ Requires-Dist: webdataset>0.2.0; extra == "all"
|
|
|
91
91
|
Requires-Dist: aiohttp; extra == "all"
|
|
92
92
|
Requires-Dist: fastapi; extra == "all"
|
|
93
93
|
Requires-Dist: numpy; extra == "all"
|
|
94
|
+
Requires-Dist: rich; extra == "all"
|
|
94
95
|
Requires-Dist: sse-starlette; extra == "all"
|
|
95
96
|
Requires-Dist: transformers; extra == "all"
|
|
96
|
-
Requires-Dist:
|
|
97
|
+
Requires-Dist: uvicorn; extra == "all"
|
|
97
98
|
Requires-Dist: gradio==5.4.0; extra == "all"
|
|
98
99
|
Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
|
|
99
100
|
Requires-Dist: diffusers; extra == "all"
|
|
@@ -110,9 +111,10 @@ Provides-Extra: perf
|
|
|
110
111
|
Requires-Dist: aiohttp; extra == "perf"
|
|
111
112
|
Requires-Dist: fastapi; extra == "perf"
|
|
112
113
|
Requires-Dist: numpy; extra == "perf"
|
|
114
|
+
Requires-Dist: rich; extra == "perf"
|
|
113
115
|
Requires-Dist: sse-starlette; extra == "perf"
|
|
114
116
|
Requires-Dist: transformers; extra == "perf"
|
|
115
|
-
Requires-Dist:
|
|
117
|
+
Requires-Dist: uvicorn; extra == "perf"
|
|
116
118
|
Provides-Extra: rag
|
|
117
119
|
Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "rag"
|
|
118
120
|
Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "rag"
|
|
@@ -177,9 +179,23 @@ Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
|
|
|
177
179
|
|
|
178
180
|
## 📝 Introduction
|
|
179
181
|
|
|
180
|
-
EvalScope is [ModelScope](https://modelscope.cn/)
|
|
182
|
+
EvalScope is a comprehensive model evaluation and performance benchmarking framework meticulously crafted by the [ModelScope Community](https://modelscope.cn/), offering a one-stop solution for your model assessment needs. Regardless of the type of model you are developing, EvalScope is equipped to cater to your requirements:
|
|
181
183
|
|
|
182
|
-
|
|
184
|
+
- 🧠 Large Language Models
|
|
185
|
+
- 🎨 Multimodal Models
|
|
186
|
+
- 🔍 Embedding Models
|
|
187
|
+
- 🏆 Reranker Models
|
|
188
|
+
- 🖼️ CLIP Models
|
|
189
|
+
- 🎭 AIGC Models (Image-to-Text/Video)
|
|
190
|
+
- ...and more!
|
|
191
|
+
|
|
192
|
+
EvalScope is not merely an evaluation tool; it is a valuable ally in your model optimization journey:
|
|
193
|
+
|
|
194
|
+
- 🏅 Equipped with multiple industry-recognized benchmarks and evaluation metrics: MMLU, CMMLU, C-Eval, GSM8K, etc.
|
|
195
|
+
- 📊 Model inference performance stress testing: Ensuring your model excels in real-world applications.
|
|
196
|
+
- 🚀 Seamless integration with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, enabling one-click evaluations and providing full-chain support from training to assessment for your model development.
|
|
197
|
+
|
|
198
|
+
Below is the overall architecture diagram of EvalScope:
|
|
183
199
|
|
|
184
200
|
<p align="center">
|
|
185
201
|
<img src="docs/en/_static/images/evalscope_framework.png" width="70%">
|
|
@@ -214,6 +230,8 @@ Please scan the QR code below to join our community groups:
|
|
|
214
230
|
|
|
215
231
|
## 🎉 News
|
|
216
232
|
|
|
233
|
+
- 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
|
|
234
|
+
- 🔥 **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate model's tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
|
|
217
235
|
- 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
|
|
218
236
|
- 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
|
|
219
237
|
- 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
|
|
@@ -479,26 +497,27 @@ For more customized evaluations, such as customizing model parameters or dataset
|
|
|
479
497
|
|
|
480
498
|
```shell
|
|
481
499
|
evalscope eval \
|
|
482
|
-
--model Qwen/
|
|
483
|
-
--model-args revision
|
|
484
|
-
--generation-config do_sample
|
|
500
|
+
--model Qwen/Qwen3-0.6B \
|
|
501
|
+
--model-args '{"revision": "master", "precision": "torch.float16", "device_map": "auto"}' \
|
|
502
|
+
--generation-config '{"do_sample":true,"temperature":0.6,"max_new_tokens":512,"chat_template_kwargs":{"enable_thinking": false}}' \
|
|
485
503
|
--dataset-args '{"gsm8k": {"few_shot_num": 0, "few_shot_random": false}}' \
|
|
486
504
|
--datasets gsm8k \
|
|
487
505
|
--limit 10
|
|
488
506
|
```
|
|
489
507
|
|
|
490
|
-
### Parameter
|
|
491
|
-
- `--model-args`: Model loading parameters,
|
|
492
|
-
- `revision`: Model version
|
|
493
|
-
- `precision`: Model precision
|
|
494
|
-
- `device_map`:
|
|
495
|
-
- `--generation-config`: Generation parameters,
|
|
496
|
-
- `do_sample`: Whether to use sampling
|
|
497
|
-
- `
|
|
498
|
-
- `max_new_tokens`: Maximum length of
|
|
499
|
-
-
|
|
508
|
+
### Parameter Description
|
|
509
|
+
- `--model-args`: Model loading parameters, passed as a JSON string:
|
|
510
|
+
- `revision`: Model version
|
|
511
|
+
- `precision`: Model precision
|
|
512
|
+
- `device_map`: Device allocation for the model
|
|
513
|
+
- `--generation-config`: Generation parameters, passed as a JSON string and parsed as a dictionary:
|
|
514
|
+
- `do_sample`: Whether to use sampling
|
|
515
|
+
- `temperature`: Generation temperature
|
|
516
|
+
- `max_new_tokens`: Maximum length of generated tokens
|
|
517
|
+
- `chat_template_kwargs`: Model inference template parameters
|
|
518
|
+
- `--dataset-args`: Settings for the evaluation dataset, passed as a JSON string where the key is the dataset name and the value is the parameters. Note that these need to correspond one-to-one with the values in the `--datasets` parameter:
|
|
500
519
|
- `few_shot_num`: Number of few-shot examples
|
|
501
|
-
- `few_shot_random`: Whether to randomly sample few-shot data
|
|
520
|
+
- `few_shot_random`: Whether to randomly sample few-shot data; if not set, defaults to `true`
|
|
502
521
|
|
|
503
522
|
Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)
|
|
504
523
|
|
|
@@ -517,6 +536,11 @@ A stress testing tool focused on large language models, which can be customized
|
|
|
517
536
|
|
|
518
537
|
Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html)
|
|
519
538
|
|
|
539
|
+
**Output example**
|
|
540
|
+
|
|
541
|
+

|
|
542
|
+
|
|
543
|
+
|
|
520
544
|
**Supports wandb for recording results**
|
|
521
545
|
|
|
522
546
|

|
|
@@ -565,7 +589,7 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
|
|
|
565
589
|
</a>
|
|
566
590
|
|
|
567
591
|
## 🔜 Roadmap
|
|
568
|
-
- [
|
|
592
|
+
- [x] Support for better evaluation report visualization
|
|
569
593
|
- [x] Support for mixed evaluations across multiple datasets
|
|
570
594
|
- [x] RAG evaluation
|
|
571
595
|
- [x] VLM evaluation
|
|
@@ -575,7 +599,7 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
|
|
|
575
599
|
- [x] Multi-modal evaluation
|
|
576
600
|
- [ ] Benchmarks
|
|
577
601
|
- [ ] GAIA
|
|
578
|
-
- [
|
|
602
|
+
- [x] GPQA
|
|
579
603
|
- [x] MBPP
|
|
580
604
|
|
|
581
605
|
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
|
|
2
|
-
evalscope/arguments.py,sha256=
|
|
3
|
-
evalscope/config.py,sha256=
|
|
2
|
+
evalscope/arguments.py,sha256=fZW-om5E2_JaFcEmkvahvundjedPLgIDde-zwDXinG0,5868
|
|
3
|
+
evalscope/config.py,sha256=19QaZ5VS8wknt4sLBxiZkR6pH-nm4Ph3Kl-1bZgcQcE,10799
|
|
4
4
|
evalscope/constants.py,sha256=PHnsGndB4N5-jvmawPxMK5b9geE2Es5cUe8ZKYSuKgM,4016
|
|
5
5
|
evalscope/run.py,sha256=_DKbxgQGwhweBnQrI7lQhu5eoz4LYPVeNanzD4lHuJA,6476
|
|
6
6
|
evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
|
|
7
7
|
evalscope/summarizer.py,sha256=61kU5ZoSh1dd8HMJPqP3ZvJwcY9szwWFCZdu2lfATJA,5920
|
|
8
|
-
evalscope/version.py,sha256=
|
|
8
|
+
evalscope/version.py,sha256=8STVV6Y877B3esrgvovInSk4IFNzxZ_ZEz9ND_6B2lQ,119
|
|
9
9
|
evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
|
|
11
11
|
evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
|
|
@@ -50,14 +50,14 @@ evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=6x-4O2pgsjZCVf
|
|
|
50
50
|
evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
51
51
|
evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
|
|
52
52
|
evalscope/backend/rag_eval/utils/embedding.py,sha256=tFMepPAMO4Kkqeqh-XxXIDYRjGbCMlk7lwuUW7FNvCA,7977
|
|
53
|
-
evalscope/backend/rag_eval/utils/llm.py,sha256=
|
|
53
|
+
evalscope/backend/rag_eval/utils/llm.py,sha256=NHjm0SeQVsSIG8uISXZcQypku4QRc3KtteeO9ldv0FI,2611
|
|
54
54
|
evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
|
|
55
55
|
evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
|
|
56
56
|
evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=sUYvQxCtPl6CrcwhQpY8lJjW5skqWc-fvHUSnXd_MvQ,6054
|
|
57
57
|
evalscope/benchmarks/__init__.py,sha256=5AXNhhmbaBFEe3u7y5TtIrviYzFI-hC8oKqxFILs1pE,937
|
|
58
58
|
evalscope/benchmarks/benchmark.py,sha256=a_7Ctz36McuTyBSTYi56jis9pvOdWhg7JVSPFrbxqR4,2535
|
|
59
|
-
evalscope/benchmarks/data_adapter.py,sha256=
|
|
60
|
-
evalscope/benchmarks/utils.py,sha256=
|
|
59
|
+
evalscope/benchmarks/data_adapter.py,sha256=lcBoXhI1Byn0HcwbVxmIeUFxZlz_wiqte6RDPOR8sbM,18184
|
|
60
|
+
evalscope/benchmarks/utils.py,sha256=jB9w3mN1eOur6j2kpQB_XZJ912fhzC0GaSeHOoylK7M,1087
|
|
61
61
|
evalscope/benchmarks/aigc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
62
62
|
evalscope/benchmarks/aigc/t2i/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
63
63
|
evalscope/benchmarks/aigc/t2i/base.py,sha256=4GFAvceT1Gpt5teDLRCZi62RwvPazuhG3zwft3gN3X4,2102
|
|
@@ -70,7 +70,7 @@ evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
|
|
|
70
70
|
evalscope/benchmarks/aime/aime24_adapter.py,sha256=GrIxCHpUwgUy8tXGTB7iQOt8k7wG8MJB0CWbwBmIy-8,1703
|
|
71
71
|
evalscope/benchmarks/aime/aime25_adapter.py,sha256=yxo5roCb8ryX9ROUU2FdZ-WBTUPZ14MrBzEL0zPOh-U,1718
|
|
72
72
|
evalscope/benchmarks/alpaca_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
73
|
-
evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=
|
|
73
|
+
evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=oUHpWrt5Gx0jF80RBd7zTh_1AWI66YvDd6U1vOMoqj0,3828
|
|
74
74
|
evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
75
75
|
evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
|
|
76
76
|
evalscope/benchmarks/arc/arc_adapter.py,sha256=0h-eT4BBmUJQrakKMPUNE1nSRwK6LHB-cflWpWzY978,6364
|
|
@@ -110,7 +110,7 @@ evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTP
|
|
|
110
110
|
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=1ITBXI0f01Dt1p7sb2RGswIeg9685Bkk2S2xmA1vat8,11295
|
|
111
111
|
evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
|
|
112
112
|
evalscope/benchmarks/chinese_simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
113
|
-
evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=
|
|
113
|
+
evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=Q6ncuLrCUrrhhljIfMsgWnyhHfcWWwh8iA6NZvz3W28,8079
|
|
114
114
|
evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
115
115
|
evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
|
|
116
116
|
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=r9zael_Y2Jso0ashevYpF8e5SHOBh8iMcPIJU5WT3pQ,10367
|
|
@@ -120,10 +120,13 @@ evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc
|
|
|
120
120
|
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=wgejW-_QswtT8_3JKAQ_H6svH8IotDJDBEH7X4nP4bY,6760
|
|
121
121
|
evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
122
122
|
evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=QgLgIrjD3q53T-lu1UWTV6T4h1cKGoCQDh0O4QxFezw,2569
|
|
123
|
+
evalscope/benchmarks/drop/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
124
|
+
evalscope/benchmarks/drop/drop_adapter.py,sha256=V-Vx6g2_1kcDUDWOKVX1vPSLt5iHn8NQkpWbsIwPaa4,8325
|
|
125
|
+
evalscope/benchmarks/drop/utils.py,sha256=Z9PHrNnRfGqFHCLONg5SWKARp1eTJlHFc_bU46t_YrM,1344
|
|
123
126
|
evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
124
127
|
evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=fqbt61owPP7t2H4B2zbYVZTs0VBGuXNvWGvkukwhRYc,5039
|
|
125
128
|
evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
126
|
-
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=
|
|
129
|
+
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=40mZovspVf-OXcuEu3ei6G_HZlYA8whAHSESHPPONxA,4750
|
|
127
130
|
evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
128
131
|
evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
|
|
129
132
|
evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=UB287DtnbkSQXZsbReFJqmQRwbo672DTCeXXilR_-Vc,4790
|
|
@@ -174,12 +177,15 @@ evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO
|
|
|
174
177
|
evalscope/benchmarks/race/race_adapter.py,sha256=RD0B-i5dzeNKuhqnWbremgf4tk9jmOO4_eLAiITB1F0,6381
|
|
175
178
|
evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
|
|
176
179
|
evalscope/benchmarks/simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
177
|
-
evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=
|
|
180
|
+
evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=TD7hkMLGZ4GK7wD7cwqJ3jCcTAaixOakUy3o5DaPYHI,8997
|
|
178
181
|
evalscope/benchmarks/super_gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
179
182
|
evalscope/benchmarks/super_gpqa/five_shot_prompt.txt,sha256=vD3RMeQustxY_oWA8IobntjywT8ZUO7Jaub--rElDT4,4718
|
|
180
183
|
evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=BqNLL8BYnK6tRuIdV6ijL4Uym2SejH_h1BV06XNjSE4,9331
|
|
181
184
|
evalscope/benchmarks/super_gpqa/utils.py,sha256=ftYPP9ODvLBlQSd9ltACx9iRIvjB8u1bg4AtgcJ4JAI,3360
|
|
182
185
|
evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt,sha256=y7hR9SmoR_YqoEWtT8N9JpZOpeJIlg0cDGDgYw6R6hM,237
|
|
186
|
+
evalscope/benchmarks/tool_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
187
|
+
evalscope/benchmarks/tool_bench/tool_bench_adapter.py,sha256=l2dBcJ4Z3m-8QFtfyFH4IqMtvkY3Rfk021P9Ff_lXWQ,2270
|
|
188
|
+
evalscope/benchmarks/tool_bench/utils.py,sha256=vIPsL8FmMF2JZRHCZeLS_dDeATKNRvZDbq6T-Znlk8Q,7025
|
|
183
189
|
evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
184
190
|
evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
|
|
185
191
|
evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
|
|
@@ -187,6 +193,8 @@ evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=7tMc8vVZdBnks5jWrBSrb
|
|
|
187
193
|
evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
188
194
|
evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
|
|
189
195
|
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=ueUU860kg5_xf_MtUCa6ck-fGHX3ttw8Xh3mWSJyOZA,12617
|
|
196
|
+
evalscope/benchmarks/winogrande/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
197
|
+
evalscope/benchmarks/winogrande/winogrande_adapter.py,sha256=UdANz3YmCtV2YfGuEihTe3vpUTlIxeXBhIqGkKbTFdU,1956
|
|
190
198
|
evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
191
199
|
evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
|
|
192
200
|
evalscope/cli/cli.py,sha256=w_dtXljur9s5lmTn6LbbFL_viTPQB1WAEzhYcId09Og,729
|
|
@@ -195,22 +203,22 @@ evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,7
|
|
|
195
203
|
evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
|
|
196
204
|
evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
|
|
197
205
|
evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
|
|
198
|
-
evalscope/collections/evaluator.py,sha256=
|
|
206
|
+
evalscope/collections/evaluator.py,sha256=3sz_bL0HMFkxq3C-4P6rNGrnQolifVISI5sEpT3Bt90,15754
|
|
199
207
|
evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
|
|
200
208
|
evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
|
|
201
209
|
evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
|
|
202
|
-
evalscope/evaluator/evaluator.py,sha256=
|
|
210
|
+
evalscope/evaluator/evaluator.py,sha256=QzTFXiv_WdPpWTB3PgBNIz9KS_Rxu-fWDvoUpML23aA,21651
|
|
203
211
|
evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
|
|
204
212
|
evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
205
213
|
evalscope/evaluator/reviewer/auto_reviewer.py,sha256=5WRYuXFTDgVmolrOdiTysk-mXrpw6Qg87-iuY-VD1W4,16618
|
|
206
214
|
evalscope/metrics/__init__.py,sha256=y1sdj5FBKYW1q5kLC6QREzoITHwstJRUdji6p0X5aAE,1363
|
|
207
|
-
evalscope/metrics/llm_judge.py,sha256=
|
|
215
|
+
evalscope/metrics/llm_judge.py,sha256=qYHsoBz-zXjL57Czl9CaPcyJT5SZr05giv5Q9SFK3cY,4000
|
|
208
216
|
evalscope/metrics/math_parser.py,sha256=uTDudn305G3b8-GboWTrDE6OfrEwAW-areHnoGXZ6Is,17302
|
|
209
217
|
evalscope/metrics/metrics.py,sha256=_YI7RhxlFu_JOgeE3LF9UKu6mJruvyu4FgqVf78Bjb8,13813
|
|
210
218
|
evalscope/metrics/named_metrics.py,sha256=PrzU_1mGTeRFxVJFT1aXxIOiS7MnNoWyZsb8uCRVDeE,2278
|
|
211
|
-
evalscope/metrics/rouge_metric.py,sha256=
|
|
219
|
+
evalscope/metrics/rouge_metric.py,sha256=bqvSotuDdC0MEKmt8v6y6tBTBx0S3Ma-tfF-cMCckA4,4645
|
|
212
220
|
evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
|
|
213
|
-
evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=
|
|
221
|
+
evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=m7v8ZysO9zCuyThEoGTe5QNVt2GsKMgZpH6du1FQCvg,12110
|
|
214
222
|
evalscope/metrics/t2v_metrics/__init__.py,sha256=GBxgKTPVy_qhW_F3M4Oi6QMWhdAi4PqGX5w3t6Tueho,1783
|
|
215
223
|
evalscope/metrics/t2v_metrics/clipscore.py,sha256=IsrYKIlFb04-FfBq4MbSv4diS6706J15Y3G4qEFIwfU,455
|
|
216
224
|
evalscope/metrics/t2v_metrics/constants.py,sha256=oY5l5fOFl8qylah9eeebZm0pgY1PYmHDa7JlUC8Qls0,451
|
|
@@ -318,19 +326,19 @@ evalscope/models/model.py,sha256=MxvJAUNkuT7IA3bchnmJDur_YCKj9ShOD2Uq40dBcGc,630
|
|
|
318
326
|
evalscope/models/register.py,sha256=pNC69YUvw-lodYpOXmByHm26h4m0Lofgd_om-JhOBq4,1882
|
|
319
327
|
evalscope/models/adapters/__init__.py,sha256=mduiDZ6LgmkefNf4CtObZk6heOB93HxxgqTuYvrqWoo,590
|
|
320
328
|
evalscope/models/adapters/base_adapter.py,sha256=f2FY8DLERudkfb4_anxNVFE_D19xCJj9BObiHWspewI,3268
|
|
321
|
-
evalscope/models/adapters/chat_adapter.py,sha256=
|
|
329
|
+
evalscope/models/adapters/chat_adapter.py,sha256=PAClyBL_nQ1I1kmjeeZ3sdC-y5ZmfFj8rjCigh_vr40,7885
|
|
322
330
|
evalscope/models/adapters/choice_adapter.py,sha256=4fuz3MFEqK8ln4mMs3goMCdRPBwYmmgN70HTdr_sW_U,8005
|
|
323
331
|
evalscope/models/adapters/custom_adapter.py,sha256=w8cD0b3xgcdhSZelcat67CGJnALOfz5IALzURnLjab8,2275
|
|
324
|
-
evalscope/models/adapters/server_adapter.py,sha256=
|
|
332
|
+
evalscope/models/adapters/server_adapter.py,sha256=d-0ne7ymWXmvKf_ypJ0093RNwplZJwhvU2xRwc8rt70,6581
|
|
325
333
|
evalscope/models/adapters/t2i_adapter.py,sha256=xkMRyZ61yTiJfmULK-p9du4nNox41pkHiV2CTFBO3qM,2659
|
|
326
334
|
evalscope/models/custom/__init__.py,sha256=MZylegALg1HerOYtp-qbzu4Wb6PW3JbrxwONHU-PAVs,131
|
|
327
335
|
evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
|
|
328
336
|
evalscope/models/custom/dummy_model.py,sha256=WRT_aCBZLXnC4yRCgggkuySkhM71C47O2Txx_YNc3UM,1933
|
|
329
337
|
evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
330
|
-
evalscope/perf/arguments.py,sha256=
|
|
331
|
-
evalscope/perf/benchmark.py,sha256=
|
|
338
|
+
evalscope/perf/arguments.py,sha256=5dTtaBR9BIobaKkX1Xj-mphHDG4uugnGaVOvWpLfN04,10714
|
|
339
|
+
evalscope/perf/benchmark.py,sha256=eGnxMLQXSYBGRJS4tS8geSJAirnuWo35M4orlRZzei8,7847
|
|
332
340
|
evalscope/perf/http_client.py,sha256=-c3-N7bxKsj3d5DVsKSaYA3XAHJDzZgoqZBbhuDYIGk,7419
|
|
333
|
-
evalscope/perf/main.py,sha256=
|
|
341
|
+
evalscope/perf/main.py,sha256=clHzkQNmv7wv-OWkuNGDQ-8YoLUCWxARIX-Eisinpms,3096
|
|
334
342
|
evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
|
|
335
343
|
evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
|
|
336
344
|
evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
|
|
@@ -349,11 +357,12 @@ evalscope/perf/plugin/datasets/random_dataset.py,sha256=SIlsjAE_Stknfr6o1CBFvANB
|
|
|
349
357
|
evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=J6q7AF_Re5eHLVejXEw9c1jlk1T1PPmist0yO9UFTPE,2432
|
|
350
358
|
evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
351
359
|
evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
|
|
352
|
-
evalscope/perf/utils/benchmark_util.py,sha256=
|
|
353
|
-
evalscope/perf/utils/db_util.py,sha256=
|
|
360
|
+
evalscope/perf/utils/benchmark_util.py,sha256=PcRTeKlEIslBw0zKVS2mFg6GgJ6J8m1f2-gAaEBeiHI,7236
|
|
361
|
+
evalscope/perf/utils/db_util.py,sha256=xqrXZapP_WwUdzkgFBTh3LDBWzr_UoU8v13rOjQ8TT4,9876
|
|
354
362
|
evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
|
|
355
363
|
evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627
|
|
356
|
-
evalscope/perf/utils/log_utils.py,sha256=
|
|
364
|
+
evalscope/perf/utils/log_utils.py,sha256=Xm5A8g8BaozaI_0TaPzr2aAxUBCCf-w7II-FcifrIYg,1503
|
|
365
|
+
evalscope/perf/utils/rich_display.py,sha256=SavP2L44UwN58ZUGR2W1wxM4h4F1iyPa90HhT-Ypkzs,8125
|
|
357
366
|
evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
358
367
|
evalscope/registry/config/cfg_arena.yaml,sha256=rub6ceaQxxB1mbSjdoFf0IaVgGfbOonV2nYRebv2OKo,3292
|
|
359
368
|
evalscope/registry/config/cfg_arena_zhihu.yaml,sha256=tvvihBwvoTjoezwTSaZwoGOB44ysofpnin4pNyY9TfQ,2755
|
|
@@ -376,11 +385,11 @@ evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHM
|
|
|
376
385
|
evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
|
|
377
386
|
evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
|
|
378
387
|
evalscope/report/__init__.py,sha256=iLNqx7CnHSHQmOBqWUK_vt2VIjnvGslJTqn--7B4y_s,316
|
|
379
|
-
evalscope/report/app.py,sha256=
|
|
388
|
+
evalscope/report/app.py,sha256=FxNpiEmbpH_B7D5SYN42idGsyOgkgFrLzScOVrwL3SI,28998
|
|
380
389
|
evalscope/report/app_arguments.py,sha256=1wHTLeFx1G94cKXYOeOVe_wTiOY2D929UctIRGOtRaQ,699
|
|
381
390
|
evalscope/report/combinator.py,sha256=O3QirwtYhDhdaWVT4STJMCGZMwoX8BTeJ3HtS9iwnWQ,2567
|
|
382
391
|
evalscope/report/generator.py,sha256=q9aHWNjQgvutAKtpjfWOpfu5zNFdnXilO9OqBqt_Phg,3612
|
|
383
|
-
evalscope/report/utils.py,sha256=
|
|
392
|
+
evalscope/report/utils.py,sha256=uu-rAzoN6ZIlv52IDWSZCcmNVY3DscNo2f9H9-gjZHY,4602
|
|
384
393
|
evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
385
394
|
evalscope/third_party/longbench_write/README.md,sha256=1yLKeSVIcihpoc4KXr8NpK86JbcLssCPx76aOKdPbYI,5431
|
|
386
395
|
evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
|
|
@@ -413,13 +422,14 @@ evalscope/third_party/toolbench_static/config_default.yaml,sha256=-6n6Zyg9eHN2ee
|
|
|
413
422
|
evalscope/third_party/toolbench_static/eval.py,sha256=do_-lVi_vEoljeLYvt3b_AYSMqpdKzgYnTek9WLSKe8,8236
|
|
414
423
|
evalscope/third_party/toolbench_static/infer.py,sha256=rsADLhEd2IBcC6EI9aD7hSJmo6Oo5b22mnHWBCZLDPs,9010
|
|
415
424
|
evalscope/third_party/toolbench_static/requirements.txt,sha256=OW91Z8hfzh7yQUYgP1Di_E6DgNgGoGP1UcvnqrdCR68,22
|
|
416
|
-
evalscope/third_party/toolbench_static/toolbench_static.py,sha256=
|
|
425
|
+
evalscope/third_party/toolbench_static/toolbench_static.py,sha256=xE__eXvSwHmmSh1tXNvyBo6MCO4mDlYTbIYl9OGEfNI,2120
|
|
417
426
|
evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
418
427
|
evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=GITEbyiER10Zi-ZWpSqYCdAsiVtNeGK24hvR3kmYn2s,2689
|
|
419
428
|
evalscope/utils/__init__.py,sha256=jLVoGryuqUh4Km9QWWQBzpqkcVNRK0MbwNaSgckqdiU,139
|
|
420
429
|
evalscope/utils/arena_utils.py,sha256=Gf8VpH4C_oF2Abif_QeL0rAP6tvTzsc0gglpdNkUE48,7155
|
|
421
430
|
evalscope/utils/chat_service.py,sha256=U2jtrkOa2asRp16Zam0zIi_38mCyWQqql_L6JSwii4I,8749
|
|
422
431
|
evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
|
|
432
|
+
evalscope/utils/deprecation_utils.py,sha256=WyeiLWSi5ti6FkuMbhimcPPUB43paa1FZ5-JOAWNFZI,1344
|
|
423
433
|
evalscope/utils/filters.py,sha256=x_NX40uWMmUsVrAGHCeeV2e63HZZFugWUgdUhk64ivM,1523
|
|
424
434
|
evalscope/utils/import_utils.py,sha256=Oo8saX_mMw4U1RrA7_pn8FmV6P9laru4fEgecqqwpqk,2585
|
|
425
435
|
evalscope/utils/io_utils.py,sha256=Tjdgen1FsAA4ArqiUzu734L0Px5NuiS0GKRRiGIzxSA,4192
|
|
@@ -431,11 +441,11 @@ tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
|
|
|
431
441
|
tests/aigc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
432
442
|
tests/aigc/test_t2i.py,sha256=BcdS3OMypWnraXF4Cq3DhDVRpZq0qo9_0Qpyg54B7FY,2627
|
|
433
443
|
tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
434
|
-
tests/cli/test_all.py,sha256=
|
|
435
|
-
tests/cli/test_collection.py,sha256=
|
|
436
|
-
tests/cli/test_run.py,sha256=
|
|
444
|
+
tests/cli/test_all.py,sha256=O3lXwOV7A0f0rmltofrjpphnshjNtaZC6NUPG-wsQjg,4082
|
|
445
|
+
tests/cli/test_collection.py,sha256=_11mSCWLaiCgheA3uguv6uI3CxqaHUKVwzS6T5BGmxs,4145
|
|
446
|
+
tests/cli/test_run.py,sha256=FTFiAb8Ge5raB1aa0Nzw8DPjFLyAlLfXHRQVIWjvvGE,17798
|
|
437
447
|
tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
438
|
-
tests/perf/test_perf.py,sha256=
|
|
448
|
+
tests/perf/test_perf.py,sha256=VbXsqiqgQY3R3bVKizYQmP04UPluUS26MO6YhTzMs48,4848
|
|
439
449
|
tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
440
450
|
tests/rag/test_clip_benchmark.py,sha256=ZCBtgnF8Vuji6WQlb92-_RIvXlUX_Xt-cHZP4AN_DNI,2552
|
|
441
451
|
tests/rag/test_mteb.py,sha256=YJw6X1jwX6SYNB-ryVb-OHJWu3vsE3Y4STATI75rdG0,5619
|
|
@@ -446,9 +456,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
|
|
|
446
456
|
tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
|
|
447
457
|
tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
448
458
|
tests/vlm/test_vlmeval.py,sha256=UqRiBPMU3vRtLIG1Qu4ZVhyUQx-zGYQuLCgobwf-7a4,3176
|
|
449
|
-
evalscope-0.
|
|
450
|
-
evalscope-0.
|
|
451
|
-
evalscope-0.
|
|
452
|
-
evalscope-0.
|
|
453
|
-
evalscope-0.
|
|
454
|
-
evalscope-0.
|
|
459
|
+
evalscope-0.16.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
|
|
460
|
+
evalscope-0.16.0.dist-info/METADATA,sha256=zX2L_cLxOjX-NNbiR40dmPOxUWyOH86zJycYjr4j5Po,35492
|
|
461
|
+
evalscope-0.16.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
462
|
+
evalscope-0.16.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
|
|
463
|
+
evalscope-0.16.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
|
|
464
|
+
evalscope-0.16.0.dist-info/RECORD,,
|
tests/cli/test_all.py
CHANGED
tests/cli/test_collection.py
CHANGED
|
@@ -78,7 +78,8 @@ class TestCollection(unittest.TestCase):
|
|
|
78
78
|
'model_id': 'qwen2.5-7b-instruct',
|
|
79
79
|
'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
80
80
|
'api_key': os.getenv('DASHSCOPE_API_KEY'),
|
|
81
|
-
}
|
|
81
|
+
},
|
|
82
|
+
use_cache='outputs/20250519_114427'
|
|
82
83
|
)
|
|
83
84
|
res = run_task(task_cfg=task_cfg)
|
|
84
85
|
print(res)
|