evalscope 0.8.0__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/backend/base.py +1 -1
- evalscope/backend/rag_eval/utils/clip.py +2 -2
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -1
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +2 -1
- evalscope/benchmarks/humaneval/humaneval_adapter.py +193 -7
- evalscope/benchmarks/race/race_adapter.py +2 -1
- evalscope/config.py +35 -1
- evalscope/constants.py +24 -38
- evalscope/evaluator/__init__.py +0 -1
- evalscope/evaluator/evaluator.py +5 -4
- evalscope/evaluator/rating_eval.py +1 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +2 -1
- evalscope/perf/arguments.py +2 -1
- evalscope/perf/benchmark.py +2 -2
- evalscope/perf/main.py +2 -5
- evalscope/perf/plugin/api/openai_api.py +2 -2
- evalscope/perf/plugin/registry.py +3 -3
- evalscope/perf/utils/benchmark_util.py +4 -4
- evalscope/perf/utils/db_util.py +66 -22
- evalscope/perf/utils/local_server.py +3 -1
- evalscope/run.py +45 -82
- evalscope/run_arena.py +2 -1
- evalscope/summarizer.py +14 -26
- evalscope/third_party/longbench_write/eval.py +2 -1
- evalscope/third_party/longbench_write/longbench_write.py +2 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/tools/combine_reports.py +2 -4
- evalscope/tools/rewrite_eval_results.py +1 -1
- evalscope/utils/__init__.py +1 -0
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/io_utils.py +162 -0
- evalscope/utils/logger.py +8 -0
- evalscope/utils/utils.py +0 -175
- evalscope/version.py +2 -2
- {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/METADATA +1 -1
- {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/RECORD +46 -46
- tests/cli/test_run.py +11 -12
- tests/perf/test_perf.py +2 -1
- tests/vlm/test_vlmeval.py +3 -2
- evalscope/evaluator/humaneval_evaluator.py +0 -158
- {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
evalscope/utils/utils.py
CHANGED
|
@@ -5,19 +5,13 @@ import functools
|
|
|
5
5
|
import hashlib
|
|
6
6
|
import importlib
|
|
7
7
|
import importlib.util
|
|
8
|
-
import json
|
|
9
|
-
import jsonlines as jsonl
|
|
10
8
|
import numpy as np
|
|
11
9
|
import os
|
|
12
10
|
import random
|
|
13
11
|
import re
|
|
14
|
-
import sys
|
|
15
12
|
import torch
|
|
16
|
-
import torch.nn.functional as F
|
|
17
|
-
import yaml
|
|
18
13
|
from typing import Any, Dict, List, Tuple, Union
|
|
19
14
|
|
|
20
|
-
from evalscope.constants import DumpMode
|
|
21
15
|
from evalscope.utils.logger import get_logger
|
|
22
16
|
|
|
23
17
|
logger = get_logger()
|
|
@@ -36,102 +30,6 @@ def test_level_list():
|
|
|
36
30
|
return TEST_LEVEL_LIST
|
|
37
31
|
|
|
38
32
|
|
|
39
|
-
def jsonl_to_list(jsonl_file):
|
|
40
|
-
"""
|
|
41
|
-
Read jsonl file to list.
|
|
42
|
-
|
|
43
|
-
Args:
|
|
44
|
-
jsonl_file: jsonl file path.
|
|
45
|
-
|
|
46
|
-
Returns:
|
|
47
|
-
list: list of lines. Each line is a dict.
|
|
48
|
-
"""
|
|
49
|
-
res_list = []
|
|
50
|
-
with jsonl.open(jsonl_file, mode='r') as reader:
|
|
51
|
-
for line in reader.iter(type=dict, allow_none=True, skip_invalid=False):
|
|
52
|
-
res_list.append(line)
|
|
53
|
-
return res_list
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
def jsonl_to_reader(jsonl_file):
|
|
57
|
-
"""
|
|
58
|
-
Read jsonl file to reader object.
|
|
59
|
-
|
|
60
|
-
Args:
|
|
61
|
-
jsonl_file: jsonl file path.
|
|
62
|
-
|
|
63
|
-
Returns:
|
|
64
|
-
reader: jsonl reader object.
|
|
65
|
-
"""
|
|
66
|
-
with jsonl.open(jsonl_file, mode='r') as reader:
|
|
67
|
-
return reader
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
def jsonl_to_csv():
|
|
71
|
-
pass
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
def dump_jsonl_data(data_list, jsonl_file, dump_mode=DumpMode.OVERWRITE):
|
|
75
|
-
"""
|
|
76
|
-
Dump data to jsonl file.
|
|
77
|
-
|
|
78
|
-
Args:
|
|
79
|
-
data_list: data list to be dumped. [{'a': 'aaa'}, ...]
|
|
80
|
-
jsonl_file: jsonl file path.
|
|
81
|
-
dump_mode: dump mode. It can be 'overwrite' or 'append'.
|
|
82
|
-
"""
|
|
83
|
-
if not jsonl_file:
|
|
84
|
-
raise ValueError('output file must be provided.')
|
|
85
|
-
|
|
86
|
-
jsonl_file = os.path.expanduser(jsonl_file)
|
|
87
|
-
|
|
88
|
-
if not isinstance(data_list, list):
|
|
89
|
-
data_list = [data_list]
|
|
90
|
-
|
|
91
|
-
if dump_mode == DumpMode.OVERWRITE:
|
|
92
|
-
dump_mode = 'w'
|
|
93
|
-
elif dump_mode == DumpMode.APPEND:
|
|
94
|
-
dump_mode = 'a'
|
|
95
|
-
with jsonl.open(jsonl_file, mode=dump_mode) as writer:
|
|
96
|
-
writer.write_all(data_list)
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
def yaml_to_dict(yaml_file) -> dict:
|
|
100
|
-
"""
|
|
101
|
-
Read yaml file to dict.
|
|
102
|
-
"""
|
|
103
|
-
with open(yaml_file, 'r') as f:
|
|
104
|
-
try:
|
|
105
|
-
stream = yaml.safe_load(f)
|
|
106
|
-
except yaml.YAMLError as e:
|
|
107
|
-
logger.error(f'{e}')
|
|
108
|
-
raise e
|
|
109
|
-
|
|
110
|
-
return stream
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
def dict_to_yaml(d: dict, yaml_file: str):
|
|
114
|
-
"""
|
|
115
|
-
Dump dict to yaml file.
|
|
116
|
-
"""
|
|
117
|
-
with open(yaml_file, 'w') as f:
|
|
118
|
-
yaml.dump(d, f, default_flow_style=False)
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
def json_to_dict(json_file) -> dict:
|
|
122
|
-
"""
|
|
123
|
-
Read json file to dict.
|
|
124
|
-
"""
|
|
125
|
-
with open(json_file, 'r') as f:
|
|
126
|
-
try:
|
|
127
|
-
stream = json.load(f)
|
|
128
|
-
except json.JSONDecodeError as e:
|
|
129
|
-
logger.error(f'{e}')
|
|
130
|
-
raise e
|
|
131
|
-
|
|
132
|
-
return stream
|
|
133
|
-
|
|
134
|
-
|
|
135
33
|
def get_obj_from_cfg(eval_class_ref: Any, *args, **kwargs) -> Any:
|
|
136
34
|
module_name, spliter, cls_name = eval_class_ref.partition(':')
|
|
137
35
|
|
|
@@ -300,18 +198,6 @@ class ResponseParser:
|
|
|
300
198
|
return ''
|
|
301
199
|
|
|
302
200
|
|
|
303
|
-
def make_outputs_dir(root_dir: str, datasets: list, model_id: str, model_revision: str):
|
|
304
|
-
if not model_id:
|
|
305
|
-
model_id = 'default'
|
|
306
|
-
model_id = model_id.replace('/', '_')
|
|
307
|
-
|
|
308
|
-
if not model_revision:
|
|
309
|
-
model_revision = 'default'
|
|
310
|
-
|
|
311
|
-
outputs_dir = os.path.join(root_dir, model_id, model_revision, f"eval_{'-'.join(datasets)}")
|
|
312
|
-
|
|
313
|
-
return outputs_dir
|
|
314
|
-
|
|
315
201
|
|
|
316
202
|
def import_module_util(import_path_prefix: str, module_name: str, members_to_import: list) -> dict:
|
|
317
203
|
"""
|
|
@@ -355,67 +241,6 @@ def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float
|
|
|
355
241
|
return score
|
|
356
242
|
|
|
357
243
|
|
|
358
|
-
def split_str_parts_by(text: str, delimiters: List[str]):
|
|
359
|
-
"""Split the text field into parts.
|
|
360
|
-
Args:
|
|
361
|
-
text: A text to be split.
|
|
362
|
-
delimiters: The delimiters.
|
|
363
|
-
Returns:
|
|
364
|
-
The split text in list of dicts.
|
|
365
|
-
"""
|
|
366
|
-
all_start_chars = [d[0] for d in delimiters]
|
|
367
|
-
all_length = [len(d) for d in delimiters]
|
|
368
|
-
|
|
369
|
-
text_list = []
|
|
370
|
-
last_words = ''
|
|
371
|
-
|
|
372
|
-
while len(text) > 0:
|
|
373
|
-
for char_idx, char in enumerate(text):
|
|
374
|
-
match_index = [idx for idx, start_char in enumerate(all_start_chars) if start_char == char]
|
|
375
|
-
is_delimiter = False
|
|
376
|
-
for index in match_index:
|
|
377
|
-
if text[char_idx:char_idx + all_length[index]] == delimiters[index]:
|
|
378
|
-
if last_words:
|
|
379
|
-
if text_list:
|
|
380
|
-
text_list[-1]['content'] = last_words
|
|
381
|
-
else:
|
|
382
|
-
text_list.append({'key': '', 'content': last_words})
|
|
383
|
-
last_words = ''
|
|
384
|
-
text_list.append({'key': delimiters[index]})
|
|
385
|
-
text = text[char_idx + all_length[index]:]
|
|
386
|
-
is_delimiter = True
|
|
387
|
-
break
|
|
388
|
-
if not is_delimiter:
|
|
389
|
-
last_words += char
|
|
390
|
-
else:
|
|
391
|
-
break
|
|
392
|
-
if last_words == text:
|
|
393
|
-
text = ''
|
|
394
|
-
|
|
395
|
-
text_list[-1]['content'] = last_words
|
|
396
|
-
return text_list
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
def get_bucket_sizes(max_length: int) -> List[int]:
|
|
400
|
-
return [max_length // 4 * (i + 1) for i in range(4)]
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
def _get_closet_bucket(bucket_sizes, data_length):
|
|
404
|
-
"""Select the one from bucket_sizes that is closest in distance to
|
|
405
|
-
data_length. This is required for TorchAcc.
|
|
406
|
-
"""
|
|
407
|
-
cloest_length = sys.maxsize
|
|
408
|
-
for b in bucket_sizes:
|
|
409
|
-
if b == data_length or ((b < cloest_length) and (b > data_length)):
|
|
410
|
-
cloest_length = b
|
|
411
|
-
|
|
412
|
-
if cloest_length == sys.maxsize:
|
|
413
|
-
bucket_sizes.append(data_length)
|
|
414
|
-
cloest_length = data_length
|
|
415
|
-
|
|
416
|
-
return cloest_length
|
|
417
|
-
|
|
418
|
-
|
|
419
244
|
def is_module_installed(module_name):
|
|
420
245
|
try:
|
|
421
246
|
importlib.import_module(module_name)
|
evalscope/version.py
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
evalscope/__init__.py,sha256=RY0EjssSquqqsysRobElYm9Ix6E41uTXeaeh7lI7kqs,106
|
|
2
2
|
evalscope/arguments.py,sha256=nozBnog45l77jxTFH_lyyJkj04ER3yyIpICepc2tC1Y,3783
|
|
3
|
-
evalscope/config.py,sha256=
|
|
4
|
-
evalscope/constants.py,sha256=
|
|
5
|
-
evalscope/run.py,sha256=
|
|
6
|
-
evalscope/run_arena.py,sha256=
|
|
7
|
-
evalscope/summarizer.py,sha256=
|
|
8
|
-
evalscope/version.py,sha256=
|
|
3
|
+
evalscope/config.py,sha256=ZDN0XVCCXMSSD675Smzm57fNDOx-cZTsNvPboMtYVow,8407
|
|
4
|
+
evalscope/constants.py,sha256=M5qJ8b7kp-RF52IwBjx5EMjeuiH1e1jdollCsbIT-c4,3753
|
|
5
|
+
evalscope/run.py,sha256=s_qE1ukrt4HBfRVAPJjC1XiqD9k7rSH7lX8yysyf5do,7279
|
|
6
|
+
evalscope/run_arena.py,sha256=6nc_S8KL7B3V4SsnpIexfvczHN9kQwHR9R1GXb2sqgI,8586
|
|
7
|
+
evalscope/summarizer.py,sha256=FgdYz7LlNs5XpDMlj2ULkVQGIg5XVeeWdWJ1_OMweq0,5882
|
|
8
|
+
evalscope/version.py,sha256=OXwZDg6ML1mbsIw-CBhWRf4zVz2ArW2PFzzLK9FVAZk,118
|
|
9
9
|
evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
-
evalscope/backend/base.py,sha256=
|
|
10
|
+
evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
|
|
11
11
|
evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
|
|
12
12
|
evalscope/backend/opencompass/api_meta_template.py,sha256=DaBJg15ZSIjxroXiygl3-4RdmIe_FD7xHbXvjSZmkQA,1706
|
|
13
13
|
evalscope/backend/opencompass/backend_manager.py,sha256=y5NnAIY1pI7E1ZSeKU3acrD-oyH3uMGL7M3nPp1WiHU,10381
|
|
@@ -68,8 +68,8 @@ evalscope/backend/rag_eval/ragas/tasks/build_transform.py,sha256=GtAYqdVOy7BxIGy
|
|
|
68
68
|
evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=B5ZETlQw5XTEDnO-VR5yXjSbbg1eUtjGts7M5msK2ik,5618
|
|
69
69
|
evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=aP8U9zjIDl26X_YF82SXLpkxoJ4nUurmdKSEoJ-qsLY,2129
|
|
70
70
|
evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
71
|
-
evalscope/backend/rag_eval/utils/clip.py,sha256=
|
|
72
|
-
evalscope/backend/rag_eval/utils/embedding.py,sha256=
|
|
71
|
+
evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
|
|
72
|
+
evalscope/backend/rag_eval/utils/embedding.py,sha256=x9HAEfZSSAnT2Tdbf-9a5UmBVagCr__ay5A2nMCPMpg,6258
|
|
73
73
|
evalscope/backend/rag_eval/utils/llm.py,sha256=619eP8pXUcwIBaktBrGNA17j53j9jfg_1JeFDYzMCIE,2582
|
|
74
74
|
evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
|
|
75
75
|
evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
|
|
@@ -122,23 +122,23 @@ evalscope/benchmarks/competition_math/__init__.py,sha256=CDK03RXT-X21WcIAlkrCs0r
|
|
|
122
122
|
evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
|
|
123
123
|
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=cHWJ6LLIWvftFXjGrOidMlZ1RGUFxPgDjs4wmBPSm1Y,18862
|
|
124
124
|
evalscope/benchmarks/general_qa/__init__.py,sha256=N2t-ehNrl9eVAarlSgJvRapm9yOjhfCWhNPPfcUUy-s,409
|
|
125
|
-
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=
|
|
125
|
+
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=cSW0Mq9__-gh-tVoVXD9Rk6h3h2iZW-Fu3RQ16haJhQ,5878
|
|
126
126
|
evalscope/benchmarks/gsm8k/__init__.py,sha256=CtcG_QM8m5zmvMs2N53d7kcm4_hIgsO2qYPyx-71aLw,313
|
|
127
127
|
evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTvt5hpXg,4233
|
|
128
|
-
evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=
|
|
128
|
+
evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=KBI9t5F7XW1Cs44QUA7ultkfsXxLyucH9zNYe-jOQQk,13866
|
|
129
129
|
evalscope/benchmarks/hellaswag/__init__.py,sha256=cY1kluaTqC7AvyzwlQYc3BF_kB3LD1gOpg6i7RDr0cI,415
|
|
130
130
|
evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
|
|
131
|
-
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=
|
|
131
|
+
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=IIesSMPw1Yya4-LjqJt1QVkpOx8RGKwBYTQtmc0VfaQ,8495
|
|
132
132
|
evalscope/benchmarks/humaneval/__init__.py,sha256=lqSlAf1-8Nzhc1j89sj6yAcaLt9pGhqu15M84bmzamc,333
|
|
133
133
|
evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
|
|
134
|
-
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=
|
|
134
|
+
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=VAO7siedusq9z3b1J3ztFE4XDopYKqmwe2n-Numg7HY,9149
|
|
135
135
|
evalscope/benchmarks/mmlu/__init__.py,sha256=OGiN1J80WDM72y242o7diYT9Rl-jkVEqTNntCl8Vt4M,385
|
|
136
136
|
evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
|
|
137
137
|
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=8T-fN_Az0gWOyME9nHl3MvcD144TjWknFKcEOMHppAI,15494
|
|
138
138
|
evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
|
|
139
139
|
evalscope/benchmarks/race/__init__.py,sha256=HVda-CB-Q-N8RbwiVLADXYNY6VLUH-frJ8VCc3jm0Mk,385
|
|
140
140
|
evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO61nU,3608
|
|
141
|
-
evalscope/benchmarks/race/race_adapter.py,sha256=
|
|
141
|
+
evalscope/benchmarks/race/race_adapter.py,sha256=WgnWYSctc3VtWm2FAeVDTlxR2hwXsF2tala7n66f5mw,9841
|
|
142
142
|
evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
|
|
143
143
|
evalscope/benchmarks/trivia_qa/__init__.py,sha256=eLMVC6tfwty5HqrQuGyWeAF2IhRNajWoO1SkLVemQj4,409
|
|
144
144
|
evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
|
|
@@ -153,12 +153,11 @@ evalscope/cli/cli.py,sha256=yNL3ZeolBc-cVr5D4GByGZWKrmpKIK-48R6wXOXO7Y0,641
|
|
|
153
153
|
evalscope/cli/start_eval.py,sha256=2lyD2WSQ0DnP6T31VvTimQ-6POnwxeEP9GLPFnT7Tfo,767
|
|
154
154
|
evalscope/cli/start_perf.py,sha256=lEHJBSpzNsO4KGlWfQc-EfZGXq1M_FpOwtRxRdb4fso,813
|
|
155
155
|
evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
|
|
156
|
-
evalscope/evaluator/__init__.py,sha256=
|
|
157
|
-
evalscope/evaluator/evaluator.py,sha256=
|
|
158
|
-
evalscope/evaluator/
|
|
159
|
-
evalscope/evaluator/rating_eval.py,sha256=VuDIZcmSlsv1tc8znDGesz8ZwpQ7NvZJPv823Quvht0,5566
|
|
156
|
+
evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
|
|
157
|
+
evalscope/evaluator/evaluator.py,sha256=nRR6aaa9J8nRfB8QPZwexSrfKDvPkPSGQpFVpbWLeW0,18380
|
|
158
|
+
evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
|
|
160
159
|
evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
161
|
-
evalscope/evaluator/reviewer/auto_reviewer.py,sha256=
|
|
160
|
+
evalscope/evaluator/reviewer/auto_reviewer.py,sha256=nL8k-i92L1iMwjPOnNxzQyZICfukZKJul4ZBvOWkHGw,16414
|
|
162
161
|
evalscope/metrics/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
163
162
|
evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
|
|
164
163
|
evalscope/metrics/math_accuracy.py,sha256=WqLfACuIeVFrX4q6_c2exnTLn2t10-rjv6sfxcqJJ14,1965
|
|
@@ -178,17 +177,17 @@ evalscope/models/api/openai_api.py,sha256=PiIvvDYJkn041SJkLoroXwl1B8TtwpB7licVfq
|
|
|
178
177
|
evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk77ksNQ,102
|
|
179
178
|
evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
|
|
180
179
|
evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
181
|
-
evalscope/perf/arguments.py,sha256=
|
|
182
|
-
evalscope/perf/benchmark.py,sha256=
|
|
180
|
+
evalscope/perf/arguments.py,sha256=J067vNJF-RObJNZ0oE2RBIBNjliCYcflWtt6aGAt40g,9205
|
|
181
|
+
evalscope/perf/benchmark.py,sha256=h151QXsVbg7lMe09aH_mxUdPRALIl1A35I9VO2zryEo,9615
|
|
183
182
|
evalscope/perf/http_client.py,sha256=TfnQT9OaBlUCpGwi4ifSJBaaGsn3P2KVBPMGuw-Rqkk,7073
|
|
184
|
-
evalscope/perf/main.py,sha256
|
|
183
|
+
evalscope/perf/main.py,sha256=2GrE9wHibprzaw4gmcovdc5ods_EHwoSwwmkFDLTUjQ,1257
|
|
185
184
|
evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
|
|
186
|
-
evalscope/perf/plugin/registry.py,sha256=
|
|
185
|
+
evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
|
|
187
186
|
evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
|
|
188
187
|
evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
|
|
189
188
|
evalscope/perf/plugin/api/custom_api.py,sha256=IplmkCu8v9yQrY5CeqBEQDWdOfOp3vRkiDYUcvhw2yY,3775
|
|
190
189
|
evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
|
|
191
|
-
evalscope/perf/plugin/api/openai_api.py,sha256=
|
|
190
|
+
evalscope/perf/plugin/api/openai_api.py,sha256=raa4SaatEphNfWuK6_3ecfe49Vg4yftD6C-enhufJuE,7020
|
|
192
191
|
evalscope/perf/plugin/datasets/__init__.py,sha256=9mz2TnVHhxbEKAS9pLbKMQuIoShNlZpGiRo9e2RQLUs,490
|
|
193
192
|
evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
|
|
194
193
|
evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815
|
|
@@ -199,10 +198,10 @@ evalscope/perf/plugin/datasets/openqa.py,sha256=2pv7yyPSFYTjPhvAGBsHl0eQO8gt7Wk1
|
|
|
199
198
|
evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=ef8MXhT6756y6LsXSpYeWjmwswu2hRXe2BOVS2_OgVM,1968
|
|
200
199
|
evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
201
200
|
evalscope/perf/utils/analysis_result.py,sha256=ig0zPwbUODGh1GUr3GmnNF4lJJp9SQvW0awWiXEIkCI,1212
|
|
202
|
-
evalscope/perf/utils/benchmark_util.py,sha256=
|
|
203
|
-
evalscope/perf/utils/db_util.py,sha256=
|
|
201
|
+
evalscope/perf/utils/benchmark_util.py,sha256=T_pXpSCwCNLJgfzgv3IO7kG61ghTLthVMsXZhBCGP_4,5541
|
|
202
|
+
evalscope/perf/utils/db_util.py,sha256=PSBq16uWyzXx0zyoEE4wazWKN19UAA8_GjobS7rTPso,9001
|
|
204
203
|
evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
|
|
205
|
-
evalscope/perf/utils/local_server.py,sha256=
|
|
204
|
+
evalscope/perf/utils/local_server.py,sha256=A26gqBbxsnZA8CqQospyO50x3prVnD9XiT2l--ERxK0,4566
|
|
206
205
|
evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
207
206
|
evalscope/registry/config/cfg_arena.yaml,sha256=rub6ceaQxxB1mbSjdoFf0IaVgGfbOonV2nYRebv2OKo,3292
|
|
208
207
|
evalscope/registry/config/cfg_arena_zhihu.yaml,sha256=tvvihBwvoTjoezwTSaZwoGOB44ysofpnin4pNyY9TfQ,2755
|
|
@@ -229,9 +228,9 @@ evalscope/third_party/longbench_write/README.md,sha256=1yLKeSVIcihpoc4KXr8NpK86J
|
|
|
229
228
|
evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
|
|
230
229
|
evalscope/third_party/longbench_write/default_task.json,sha256=d_NPShtW10Mc02U3pAuxX9hXd09tZw7QJAr1SvrECcM,694
|
|
231
230
|
evalscope/third_party/longbench_write/default_task.yaml,sha256=YjU8EeyH9UtM8e7_fhrwJNChQdszOAcrKmOi--Awvhk,578
|
|
232
|
-
evalscope/third_party/longbench_write/eval.py,sha256=
|
|
231
|
+
evalscope/third_party/longbench_write/eval.py,sha256=39McZSDHL7bA5Dg-BSyZ4EiAF1nfTiYJAnx5FqbNYok,11265
|
|
233
232
|
evalscope/third_party/longbench_write/infer.py,sha256=bFsOp--8Qn6qQ-NpdLY0bennQGQl5TMGEngvGda8k7g,4937
|
|
234
|
-
evalscope/third_party/longbench_write/longbench_write.py,sha256=
|
|
233
|
+
evalscope/third_party/longbench_write/longbench_write.py,sha256=nIR1toB1hvUXR7Lrs3xcY9wqaI-bjeADg_Oscf3HdaY,3991
|
|
235
234
|
evalscope/third_party/longbench_write/utils.py,sha256=nd-YslsOyNGAuyBfAWb2pnTMaGLMQ58lbnJJdrCndeI,815
|
|
236
235
|
evalscope/third_party/longbench_write/resources/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
237
236
|
evalscope/third_party/longbench_write/resources/judge.txt,sha256=Go1ISY4bUBmEDXXY_DItjAmskuHSaRj5WTNMNH98FSk,1885
|
|
@@ -239,7 +238,7 @@ evalscope/third_party/longbench_write/resources/longbench_write.jsonl,sha256=H26
|
|
|
239
238
|
evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl,sha256=h4AJJ3YfNA5IiZ5N9dR_tyEa1JNqY0INv6l5ZgQUJZ8,24235
|
|
240
239
|
evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl,sha256=odTr8N8PoWAFZ2kdEcmlLeMDfEo3KXDtLo9S8oieCmI,5718
|
|
241
240
|
evalscope/third_party/longbench_write/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
242
|
-
evalscope/third_party/longbench_write/tools/data_etl.py,sha256=
|
|
241
|
+
evalscope/third_party/longbench_write/tools/data_etl.py,sha256=T7a-4PwZg5alZQh-oTi1zjMxjGmVVZYVwSR9-diZlF8,5971
|
|
243
242
|
evalscope/third_party/toolbench_static/README.md,sha256=Osdnt0_K-setbmYwDPCPRp2LXxamGp2mE8KsOByPPOY,3944
|
|
244
243
|
evalscope/third_party/toolbench_static/__init__.py,sha256=BO936RxwodHr4OEpV6W3S_keC91OfOd41_msIJ2d0fs,128
|
|
245
244
|
evalscope/third_party/toolbench_static/config_default.json,sha256=KrUzeHL2DNiM5FwY7cH3KZlxTwELCQZ6e39nilfUi0M,368
|
|
@@ -247,26 +246,27 @@ evalscope/third_party/toolbench_static/config_default.yaml,sha256=-6n6Zyg9eHN2ee
|
|
|
247
246
|
evalscope/third_party/toolbench_static/eval.py,sha256=do_-lVi_vEoljeLYvt3b_AYSMqpdKzgYnTek9WLSKe8,8236
|
|
248
247
|
evalscope/third_party/toolbench_static/infer.py,sha256=rsADLhEd2IBcC6EI9aD7hSJmo6Oo5b22mnHWBCZLDPs,9010
|
|
249
248
|
evalscope/third_party/toolbench_static/requirements.txt,sha256=OW91Z8hfzh7yQUYgP1Di_E6DgNgGoGP1UcvnqrdCR68,22
|
|
250
|
-
evalscope/third_party/toolbench_static/toolbench_static.py,sha256=
|
|
249
|
+
evalscope/third_party/toolbench_static/toolbench_static.py,sha256=ABb9Gy09zMt30tY50AZGxSZ46k3NVEsvuDj6xlLOjeA,1966
|
|
251
250
|
evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
252
251
|
evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=usmVelh0ogBlCtSUL0dqp89w2mAqH1Ptv9MURVoGrc8,1209
|
|
253
252
|
evalscope/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
254
|
-
evalscope/tools/combine_reports.py,sha256=
|
|
253
|
+
evalscope/tools/combine_reports.py,sha256=JFf3P_GJLPdlSqpv30D8ioPb7dup3tOTktsELmsKXLI,4900
|
|
255
254
|
evalscope/tools/gen_mmlu_subject_mapping.py,sha256=CUmRdReEU7QfMyprh9I56KmHoRww_zUda_JuyxmCL1A,3277
|
|
256
|
-
evalscope/tools/rewrite_eval_results.py,sha256=
|
|
257
|
-
evalscope/utils/__init__.py,sha256=
|
|
255
|
+
evalscope/tools/rewrite_eval_results.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
|
|
256
|
+
evalscope/utils/__init__.py,sha256=ZOri8VHx8LpJBJS90uw8h0Z7gPhtxhjWlBPWuuZgoRE,121
|
|
258
257
|
evalscope/utils/arena_utils.py,sha256=Gf8VpH4C_oF2Abif_QeL0rAP6tvTzsc0gglpdNkUE48,7155
|
|
259
|
-
evalscope/utils/chat_service.py,sha256=
|
|
258
|
+
evalscope/utils/chat_service.py,sha256=VdNPXdFSf-4zxe0Ht74LBcdRNbpb9vzVi86HDEqfXHc,8647
|
|
260
259
|
evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
|
|
261
|
-
evalscope/utils/
|
|
260
|
+
evalscope/utils/io_utils.py,sha256=MnEi4llOYtXK81bUQ_XE_WP5qIsVrJ4MlKmWMH9vzFs,3993
|
|
261
|
+
evalscope/utils/logger.py,sha256=4OGlkBsut_wzq-1UcM2DKQKdKs1FRNYGHw538TGvypU,3440
|
|
262
262
|
evalscope/utils/model_utils.py,sha256=zMS1YRu4CzU4CVLZS6e_lgfHIDBqv3YBTJbPF1R2M90,443
|
|
263
|
-
evalscope/utils/utils.py,sha256=
|
|
263
|
+
evalscope/utils/utils.py,sha256=lZl5lt4WqjoY5SEfsum8Sc-s_c9GSlmIZlkTAQkMnjE,10485
|
|
264
264
|
tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
265
265
|
tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
|
|
266
266
|
tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
267
|
-
tests/cli/test_run.py,sha256=
|
|
267
|
+
tests/cli/test_run.py,sha256=pMZvI3b0Vs-UFfciDoPwCYFAaYJzocQjxEaMLFTxYSo,4289
|
|
268
268
|
tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
269
|
-
tests/perf/test_perf.py,sha256=
|
|
269
|
+
tests/perf/test_perf.py,sha256=AQB2QuMwJ1TnenHFPBF4YAtifbR0D0pSobP6xmDysqw,3023
|
|
270
270
|
tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
271
271
|
tests/rag/test_clip_benchmark.py,sha256=Ar8Br2CoAFYT2h4zCv_JKMKCGJKbKGYZgNwJ410ZaoU,2597
|
|
272
272
|
tests/rag/test_mteb.py,sha256=CaEJ0f1M06Z90c72FQb9z23IC_KZtkURWsc_oRMgQn8,4609
|
|
@@ -276,10 +276,10 @@ tests/swift/test_run_swift_eval.py,sha256=JKG-0BwTxkbg-XeiXxujPqnVIM3f2EFaJ_9a7p
|
|
|
276
276
|
tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZUQhtYO5e0o,4898
|
|
277
277
|
tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
|
|
278
278
|
tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
279
|
-
tests/vlm/test_vlmeval.py,sha256=
|
|
280
|
-
evalscope-0.8.
|
|
281
|
-
evalscope-0.8.
|
|
282
|
-
evalscope-0.8.
|
|
283
|
-
evalscope-0.8.
|
|
284
|
-
evalscope-0.8.
|
|
285
|
-
evalscope-0.8.
|
|
279
|
+
tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
|
|
280
|
+
evalscope-0.8.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
|
|
281
|
+
evalscope-0.8.1.dist-info/METADATA,sha256=HydrEYb1OxbvVUMl11oLekV2sjvlgQQvtEpkcNAiW5A,23190
|
|
282
|
+
evalscope-0.8.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
283
|
+
evalscope-0.8.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
|
|
284
|
+
evalscope-0.8.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
|
|
285
|
+
evalscope-0.8.1.dist-info/RECORD,,
|
tests/cli/test_run.py
CHANGED
|
@@ -70,7 +70,7 @@ class TestRun(unittest.TestCase):
|
|
|
70
70
|
|
|
71
71
|
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
72
72
|
def test_run_task(self):
|
|
73
|
-
task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct', 'datasets': ['gsm8k'], 'limit': 2, 'debug': False}
|
|
73
|
+
task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct', 'datasets': ['bbh', 'gsm8k', 'arc'], 'limit': 2, 'debug': False}
|
|
74
74
|
run_task(task_cfg=task_cfg)
|
|
75
75
|
|
|
76
76
|
|
|
@@ -80,33 +80,32 @@ class TestRun(unittest.TestCase):
|
|
|
80
80
|
|
|
81
81
|
task_cfg = TaskConfig(
|
|
82
82
|
model='qwen/Qwen2-0.5B-Instruct',
|
|
83
|
-
datasets=['ceval'], # 数据格式,选择题格式固定为 'ceval'
|
|
83
|
+
datasets=['ceval', 'general_qa'], # 数据格式,选择题格式固定为 'ceval'
|
|
84
84
|
dataset_args={
|
|
85
85
|
'ceval': {
|
|
86
86
|
'local_path': 'custom_eval/text/mcq', # 自定义数据集路径
|
|
87
87
|
'subset_list': [
|
|
88
88
|
'example' # 评测数据集名称,上述 *_dev.csv 中的 *
|
|
89
89
|
]
|
|
90
|
+
},
|
|
91
|
+
'general_qa': {
|
|
92
|
+
'local_path': 'custom_eval/text/qa', # 自定义数据集路径
|
|
93
|
+
'subset_list': [
|
|
94
|
+
'example' # 评测数据集名称,上述 *_dev.csv 中的 *
|
|
95
|
+
]
|
|
90
96
|
}
|
|
91
97
|
},
|
|
92
98
|
)
|
|
93
99
|
run_task(task_cfg=task_cfg)
|
|
94
100
|
|
|
95
101
|
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
96
|
-
def
|
|
102
|
+
def test_run_humaneval(self):
|
|
97
103
|
from evalscope.config import TaskConfig
|
|
98
104
|
|
|
99
105
|
task_cfg = TaskConfig(
|
|
100
106
|
model='qwen/Qwen2-0.5B-Instruct',
|
|
101
|
-
datasets=['
|
|
102
|
-
|
|
103
|
-
'general_qa': {
|
|
104
|
-
'local_path': 'custom_eval/text/qa', # 自定义数据集路径
|
|
105
|
-
'subset_list': [
|
|
106
|
-
'example' # 评测数据集名称,上述 *_dev.csv 中的 *
|
|
107
|
-
]
|
|
108
|
-
}
|
|
109
|
-
},
|
|
107
|
+
datasets=['humaneval'],
|
|
108
|
+
limit=2
|
|
110
109
|
)
|
|
111
110
|
|
|
112
111
|
run_task(task_cfg=task_cfg)
|
tests/perf/test_perf.py
CHANGED
|
@@ -25,6 +25,7 @@ class TestPerf(unittest.TestCase):
|
|
|
25
25
|
'number': 15,
|
|
26
26
|
'api': 'openai',
|
|
27
27
|
'dataset': 'openqa',
|
|
28
|
+
'stream': True,
|
|
28
29
|
'debug': True,
|
|
29
30
|
}
|
|
30
31
|
run_perf_benchmark(task_cfg)
|
|
@@ -46,7 +47,7 @@ class TestPerf(unittest.TestCase):
|
|
|
46
47
|
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
47
48
|
def test_run_perf_speed_benchmark(self):
|
|
48
49
|
task_cfg = {
|
|
49
|
-
'url': 'http://127.0.0.1:
|
|
50
|
+
'url': 'http://127.0.0.1:8801/v1/completions',
|
|
50
51
|
'parallel': 1,
|
|
51
52
|
'model': 'qwen2.5',
|
|
52
53
|
'api': 'openai',
|
tests/vlm/test_vlmeval.py
CHANGED
|
@@ -40,8 +40,9 @@ class TestVLMEval(unittest.TestCase):
|
|
|
40
40
|
}], # model name for VLMEval config
|
|
41
41
|
'nproc': 1,
|
|
42
42
|
'reuse': True,
|
|
43
|
-
|
|
44
|
-
|
|
43
|
+
},
|
|
44
|
+
'work_dir': 'outputs',
|
|
45
|
+
'use_cache': 'outputs/20241216_142838'
|
|
45
46
|
}
|
|
46
47
|
|
|
47
48
|
logger.info(f'>> Start to run task: {task_cfg}')
|